//
// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
//
// SPDX-License-Identifier: Apache-2.0
//

// Suppress overlength-string warnings: the inline assembly below is one very long string literal.
#pragma GCC diagnostic ignored "-Woverlength-strings"

#if !defined(__aarch64__) || !defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) || \
    !defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
#error This file must be compiled for AArch64, FEAT_FP16.
#else  // Architectural features check.

#include "kai_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla.h"

#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

#include "kai/kai_common.h"

// Blocking constants reported by the getter functions below.
// kai_mr: value returned as the m step; row indices passed to the LHS/DST offset helpers must be multiples of it.
static const size_t kai_mr = 6;
// kai_nr: value returned as both the n step and nr; column indices passed to the packed-RHS/DST offset helpers
// must be multiples of it.
static const size_t kai_nr = 16;
// kai_kr: value returned by the kr getter.
static const size_t kai_kr = 1;
// kai_sr: value returned by the sr getter.
static const size_t kai_sr = 1;

// Reports the m step of this micro-kernel, i.e. the file-level constant kai_mr.
size_t kai_get_m_step_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla(void) {
    const size_t m_step = kai_mr;
    return m_step;
}

// Reports the n step of this micro-kernel, i.e. the file-level constant kai_nr.
size_t kai_get_n_step_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla(void) {
    const size_t n_step = kai_nr;
    return n_step;
}

// Reports the nr blocking parameter of this micro-kernel (the file-level constant kai_nr).
size_t kai_get_nr_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla(void) {
    const size_t nr = kai_nr;
    return nr;
}

// Reports the kr blocking parameter of this micro-kernel (the file-level constant kai_kr).
size_t kai_get_kr_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla(void) {
    const size_t kr = kai_kr;
    return kr;
}

// Reports the sr blocking parameter of this micro-kernel (the file-level constant kai_sr).
size_t kai_get_sr_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla(void) {
    const size_t sr = kai_sr;
    return sr;
}

// Computes the offset into the LHS buffer for a given starting row.
//
// m_idx  - row index; must be a multiple of kai_mr.
// stride - multiplier applied to the row index (presumably the LHS row stride in bytes; confirm against the header).
//
// Returns m_idx * stride.
size_t kai_get_lhs_offset_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla(size_t m_idx, size_t stride) {
    // Only row indices aligned to the m step are supported.
    KAI_ASSUME(m_idx % kai_mr == 0);

    const size_t lhs_offset = m_idx * stride;
    return lhs_offset;
}

// Computes the byte offset into the packed RHS buffer for a given starting column.
//
// n_idx - column index; must be a multiple of kai_nr.
// k     - depth used to size each packed column block.
//
// Each block of kai_nr columns is sized as one leading row of kai_nr 16-bit values (the kernel name suggests this
// is the f16 bias) plus k rows of kai_nr 16-bit values. The result is the number of whole blocks that precede
// n_idx multiplied by that block size.
size_t kai_get_rhs_packed_offset_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla(size_t n_idx, size_t k) {
    // Only column indices aligned to a packed block are supported.
    KAI_ASSUME(n_idx % kai_nr == 0);

    const size_t block_index = n_idx / kai_nr;
    const size_t leading_row_bytes = kai_nr * sizeof(uint16_t);
    const size_t data_rows_bytes = kai_nr * k * sizeof(uint16_t);

    return block_index * (leading_row_bytes + data_rows_bytes);
}

// Computes the offset into the destination buffer for a given starting row and column.
//
// m_idx  - row index; must be a multiple of kai_mr.
// n_idx  - column index; must be a multiple of kai_nr.
// stride - multiplier applied to the row index (presumably the DST row stride in bytes; confirm against the header).
//
// Returns m_idx * stride plus n_idx 16-bit elements expressed in bytes.
size_t kai_get_dst_offset_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla(
    size_t m_idx, size_t n_idx, size_t stride) {
    // Both indices have to sit on block boundaries.
    KAI_ASSUME(m_idx % kai_mr == 0);
    KAI_ASSUME(n_idx % kai_nr == 0);

    const size_t row_part = m_idx * stride;
    const size_t col_part = n_idx * sizeof(uint16_t);

    return row_part + col_part;
}

// Computes the size in bytes of an m x n destination matrix of 16-bit elements.
//
// m - number of rows.
// n - number of columns.
//
// Returns m * n * sizeof(uint16_t).
size_t kai_get_dst_size_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla(size_t m, size_t n) {
    const size_t element_count = m * n;
    return element_count * sizeof(uint16_t);
}

void kai_run_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla(
    size_t m, size_t n, size_t k,                             //
    const void* lhs, size_t lhs_stride,                       //
    const void* rhs_packed,                                   //
    void* dst, size_t dst_stride_row, size_t dst_stride_col,  //
    float clamp_min, float clamp_max) {
    KAI_ASSERT(dst_stride_col == sizeof(uint16_t));

    typedef struct {
        float16_t maxval;
        float16_t minval;
        unsigned int num_strings;
        const unsigned int* string_lengths;
        size_t N;
        const void* B_ptr;
        size_t output_offset;
        size_t input_initial_col;
        size_t input_offset;
        void* output_ptr;
        const void* bias;
    } KernelArgs;

    KernelArgs ka;

    unsigned long flags = 0;

    unsigned int string_length = k;
    ka.num_strings = 1;
    ka.string_lengths = &string_length;
    ka.N = n;
    ka.B_ptr = rhs_packed;
    ka.bias = NULL;

    // Direct input.
    const void* input_ptr = lhs;
    ka.input_offset = lhs_stride / sizeof(uint16_t);
    ka.input_initial_col = 0;

    // Direct output.
    ka.output_ptr = dst;
    ka.output_offset = dst_stride_row / sizeof(uint16_t);

    // Clamping output.
    flags |= 0x2;
    ka.maxval = (float16_t)clamp_max;
    ka.minval = (float16_t)clamp_min;

    __asm__ __volatile__(
        "1:"  // Row loop
        "cmp %x[m], #0x6\n"
        "bge 166f\n"
        "cmp %x[m], #0x4\n"
        "bgt 133f\n"
        "beq 100f\n"
        "cmp %x[m], #0x2\n"
        "bgt 67f\n"
        "beq 34f\n"
        "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
        "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
        "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
        "2:"  // Height 1: Column loop
        "cbz x10, 3f\n"
        "ldr q20, [x10, #0x0]\n"
        "ldr q21, [x10, #0x10]\n"
        "add x10, x10, #0x20\n"
        "b 14f\n"
        "3:"  // Height 1: no bias
        "tbz %x[flags], #0, 13f\n"
        "cmp x11, #0x10\n"
        "bge 12f\n"
        "tbz x11, #3, 7f\n"
        "ld1 { v20.8h }, [x9], #0x10\n"
        "tbz x11, #2, 5f\n"
        "ldr d21, [x9], #0x8\n"
        "tbz x11, #1, 4f\n"
        "ld1 { v21.s }[2], [x9], #0x4\n"
        "mov x20, #0x1c\n"
        "tbz x11, #0, 11f\n"
        "ld1 { v21.h }[6], [x9]\n"
        "b 11f\n"
        "4:"  // Height 1: Partial accumulate: partial_1_12
        "mov x20, #0x18\n"
        "tbz x11, #0, 11f\n"
        "ld1 { v21.h }[4], [x9]\n"
        "b 11f\n"
        "5:"  // Height 1: Partial accumulate: partial_2_8
        "tbz x11, #1, 6f\n"
        "ldr s21, [x9], #0x4\n"
        "mov x20, #0x14\n"
        "tbz x11, #0, 11f\n"
        "ld1 { v21.h }[2], [x9]\n"
        "b 11f\n"
        "6:"  // Height 1: Partial accumulate: partial_1_8
        "mov x20, #0x10\n"
        "tbz x11, #0, 11f\n"
        "ldr h21, [x9, #0x0]\n"
        "b 11f\n"
        "7:"  // Height 1: Partial accumulate: partial_4_0
        "tbz x11, #2, 9f\n"
        "ldr d20, [x9], #0x8\n"
        "tbz x11, #1, 8f\n"
        "ld1 { v20.s }[2], [x9], #0x4\n"
        "mov x20, #0xc\n"
        "tbz x11, #0, 11f\n"
        "ld1 { v20.h }[6], [x9]\n"
        "b 11f\n"
        "8:"  // Height 1: Partial accumulate: partial_1_4
        "mov x20, #0x8\n"
        "tbz x11, #0, 11f\n"
        "ld1 { v20.h }[4], [x9]\n"
        "b 11f\n"
        "9:"  // Height 1: Partial accumulate: partial_2_0
        "tbz x11, #1, 10f\n"
        "ldr s20, [x9], #0x4\n"
        "mov x20, #0x4\n"
        "tbz x11, #0, 11f\n"
        "ld1 { v20.h }[2], [x9]\n"
        "b 11f\n"
        "10:"  // Height 1: Partial accumulate: partial_1_0
        "ldr h20, [x9, #0x0]\n"
        "mov x20, #0x0\n"
        "11:"  // Height 1: Partial accumulate: Done
        "sub x9, x9, x20\n"
        "b 14f\n"
        "12:"  // Height 1: full accumulate
        "ldr q20, [x9, #0x0]\n"
        "ldr q21, [x9, #0x10]\n"
        "b 14f\n"
        "13:"  // Height 1: no accumulate
        "movi v20.16b, #0x0\n"
        "movi v21.16b, #0x0\n"
        "14:"  // Height 1: setup done
        "mov x28, #0x0\n"
        "15:"  // Height 1: String loop
        "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
        "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
        "ldr w27, [x20, x28, LSL #0x2]\n"
        "tbz %x[flags], #3, 16f\n"
        "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
        "add x20, x20, x21, LSL #3\n"
        "ldr x26, [x20, #0x0]\n"
        "cbnz x28, 17f\n"
        "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
        "add x26, x26, x20, LSL #1\n"
        "b 17f\n"
        "16:"  // Height 1: setup direct input
        "mov x26, %x[input_ptr]\n"
        "17:"  // Height 1: input setup done
        "cmp x27, #0x8\n"
        "blt 20f\n"
        "ldr q0, [x26, #0x0]\n"
        "ldr q6, [x10, #0x0]\n"
        "cmp x27, #0x10\n"
        "ldr q7, [x10, #0x10]\n"
        "ldr q8, [x10, #0x20]\n"
        "ldr q9, [x10, #0x30]\n"
        "ldr q10, [x10, #0x40]\n"
        "ldr q11, [x10, #0x50]\n"
        "ldr q12, [x10, #0x60]\n"
        "ldr q13, [x10, #0x70]\n"
        "ldr q14, [x10, #0x80]\n"
        "ldr q15, [x10, #0x90]\n"
        "ldr q16, [x10, #0xa0]\n"
        "ldr q17, [x10, #0xb0]\n"
        "ldr q18, [x10, #0xc0]\n"
        "ldr q19, [x10, #0xd0]\n"
        "blt 19f\n"
        "18:"  // Height 1: Multiply loop: Main loop head
        "fmla v20.8h, v6.8h, v0.h[0]\n"
        "ldr q6, [x10, #0xe0]\n"
        "fmla v21.8h, v7.8h, v0.h[0]\n"
        "ldr q7, [x10, #0xf0]\n"
        "sub x27, x27, #0x8\n"
        "add x26, x26, #0x10\n"
        "cmp x27, #0x10\n"
        "add x10, x10, #0x100\n"
        "prfm pldl1keep, [x26, #0x80]\n"
        "fmla v20.8h, v8.8h, v0.h[1]\n"
        "ldr q8, [x10, #0x20]\n"
        "fmla v21.8h, v9.8h, v0.h[1]\n"
        "ldr q9, [x10, #0x30]\n"
        "fmla v20.8h, v10.8h, v0.h[2]\n"
        "ldr q10, [x10, #0x40]\n"
        "fmla v21.8h, v11.8h, v0.h[2]\n"
        "ldr q11, [x10, #0x50]\n"
        "fmla v20.8h, v12.8h, v0.h[3]\n"
        "ldr q12, [x10, #0x60]\n"
        "fmla v21.8h, v13.8h, v0.h[3]\n"
        "ldr q13, [x10, #0x70]\n"
        "fmla v20.8h, v14.8h, v0.h[4]\n"
        "ldr q14, [x10, #0x80]\n"
        "fmla v21.8h, v15.8h, v0.h[4]\n"
        "ldr q15, [x10, #0x90]\n"
        "fmla v20.8h, v16.8h, v0.h[5]\n"
        "ldr q16, [x10, #0xa0]\n"
        "fmla v21.8h, v17.8h, v0.h[5]\n"
        "ldr q17, [x10, #0xb0]\n"
        "fmla v20.8h, v18.8h, v0.h[6]\n"
        "ldr q18, [x10, #0xc0]\n"
        "fmla v21.8h, v19.8h, v0.h[6]\n"
        "ldr q19, [x10, #0xd0]\n"
        "fmla v20.8h, v6.8h, v0.h[7]\n"
        "ldr q6, [x10, #0x0]\n"
        "fmla v21.8h, v7.8h, v0.h[7]\n"
        "ldr q0, [x26, #0x0]\n"
        "ldr q7, [x10, #0x10]\n"
        "bge 18b\n"
        "19:"  // Height 1: Multiply loop: Single iteration only
        "fmla v20.8h, v6.8h, v0.h[0]\n"
        "ldr q6, [x10, #0xe0]\n"
        "fmla v21.8h, v7.8h, v0.h[0]\n"
        "ldr q7, [x10, #0xf0]\n"
        "add x26, x26, #0x10\n"
        "sub x27, x27, #0x8\n"
        "add x10, x10, #0x100\n"
        "prfm pldl1keep, [x26, #0x80]\n"
        "fmla v20.8h, v8.8h, v0.h[1]\n"
        "fmla v21.8h, v9.8h, v0.h[1]\n"
        "fmla v20.8h, v10.8h, v0.h[2]\n"
        "fmla v21.8h, v11.8h, v0.h[2]\n"
        "fmla v20.8h, v12.8h, v0.h[3]\n"
        "fmla v21.8h, v13.8h, v0.h[3]\n"
        "fmla v20.8h, v14.8h, v0.h[4]\n"
        "fmla v21.8h, v15.8h, v0.h[4]\n"
        "fmla v20.8h, v16.8h, v0.h[5]\n"
        "fmla v21.8h, v17.8h, v0.h[5]\n"
        "fmla v20.8h, v18.8h, v0.h[6]\n"
        "fmla v21.8h, v19.8h, v0.h[6]\n"
        "fmla v20.8h, v6.8h, v0.h[7]\n"
        "fmla v21.8h, v7.8h, v0.h[7]\n"
        "20:"  // Height 1: Multiply loop: Main loop skip
        "cbz x27, 22f\n"
        "21:"  // Height 1: Multiply loop: Odd block loop
        "ldr h0, [x26], #0x2\n"
        "ldr q8, [x10, #0x0]\n"
        "sub x27, x27, #0x1\n"
        "ldr q9, [x10, #0x10]\n"
        "add x10, x10, #0x20\n"
        "fmla v20.8h, v8.8h, v0.h[0]\n"
        "fmla v21.8h, v9.8h, v0.h[0]\n"
        "cbnz x27, 21b\n"
        "22:"  // Height 1: Multiply loop: No odd multiplies
        "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
        "add x28, x28, #0x1\n"
        "cmp x28, x20\n"
        "bne 15b\n"
        "prfm pstl1keep, [x9, #0x0]\n"
        "tbz %x[flags], #1, 23f\n"
        "add x21, %x[args_ptr], %[offset_max]\n"
        "add x20, %x[args_ptr], %[offset_min]\n"
        "ld1r { v17.8h }, [x21]\n"
        "ld1r { v16.8h }, [x20]\n"
        "fmin v20.8h, v20.8h, v17.8h\n"
        "fmin v21.8h, v21.8h, v17.8h\n"
        "fmax v20.8h, v20.8h, v16.8h\n"
        "fmax v21.8h, v21.8h, v16.8h\n"
        "23:"  // Height 1: No activation
        "cmp x11, #0x10\n"
        "bge 32f\n"
        "tbz x11, #3, 27f\n"
        "st1 { v20.8h }, [x9], #0x10\n"
        "tbz x11, #2, 25f\n"
        "str d21, [x9], #0x8\n"
        "tbz x11, #1, 24f\n"
        "st1 { v21.s }[2], [x9], #0x4\n"
        "tbz x11, #0, 31f\n"
        "st1 { v21.h }[6], [x9]\n"
        "b 31f\n"
        "24:"  // Height 1: Partial direct writeback: partial_1_12
        "tbz x11, #0, 31f\n"
        "st1 { v21.h }[4], [x9]\n"
        "b 31f\n"
        "25:"  // Height 1: Partial direct writeback: partial_2_8
        "tbz x11, #1, 26f\n"
        "str s21, [x9], #0x4\n"
        "tbz x11, #0, 31f\n"
        "st1 { v21.h }[2], [x9]\n"
        "b 31f\n"
        "26:"  // Height 1: Partial direct writeback: partial_1_8
        "tbz x11, #0, 31f\n"
        "str h21, [x9, #0x0]\n"
        "b 31f\n"
        "27:"  // Height 1: Partial direct writeback: partial_4_0
        "tbz x11, #2, 29f\n"
        "str d20, [x9], #0x8\n"
        "tbz x11, #1, 28f\n"
        "st1 { v20.s }[2], [x9], #0x4\n"
        "tbz x11, #0, 31f\n"
        "st1 { v20.h }[6], [x9]\n"
        "b 31f\n"
        "28:"  // Height 1: Partial direct writeback: partial_1_4
        "tbz x11, #0, 31f\n"
        "st1 { v20.h }[4], [x9]\n"
        "b 31f\n"
        "29:"  // Height 1: Partial direct writeback: partial_2_0
        "tbz x11, #1, 30f\n"
        "str s20, [x9], #0x4\n"
        "tbz x11, #0, 31f\n"
        "st1 { v20.h }[2], [x9]\n"
        "b 31f\n"
        "30:"  // Height 1: Partial direct writeback: partial_1_0
        "str h20, [x9, #0x0]\n"
        "31:"  // Height 1: Partial direct writeback: Done
        "b 33f\n"
        "32:"  // Height 1: Full writeback
        "str q20, [x9, #0x0]\n"
        "str q21, [x9, #0x10]\n"
        "add x9, x9, #0x20\n"
        "33:"  // Height 1: Writeback done
        "subs x11, x11, #0x10\n"
        "bgt 2b\n"
        "b 200f\n"
        "34:"  // Height 2
        "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
        "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
        "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
        "35:"  // Height 2: Column loop
        "cbz x10, 36f\n"
        "ldr q20, [x10, #0x0]\n"
        "ldr q21, [x10, #0x10]\n"
        "add x10, x10, #0x20\n"
        "mov v22.16b, v20.16b\n"
        "mov v23.16b, v21.16b\n"
        "b 47f\n"
        "36:"  // Height 2: no bias
        "tbz %x[flags], #0, 46f\n"
        "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
        "cmp x11, #0x10\n"
        "add x26, x9, x20, LSL #1\n"
        "bge 45f\n"
        "tbz x11, #3, 40f\n"
        "ld1 { v20.8h }, [x9], #0x10\n"
        "ld1 { v22.8h }, [x26], #0x10\n"
        "tbz x11, #2, 38f\n"
        "ldr d21, [x9], #0x8\n"
        "ldr d23, [x26], #0x8\n"
        "tbz x11, #1, 37f\n"
        "ld1 { v21.s }[2], [x9], #0x4\n"
        "ld1 { v23.s }[2], [x26], #0x4\n"
        "mov x20, #0x1c\n"
        "tbz x11, #0, 44f\n"
        "ld1 { v21.h }[6], [x9]\n"
        "ld1 { v23.h }[6], [x26]\n"
        "b 44f\n"
        "37:"  // Height 2: Partial accumulate: partial_1_12
        "mov x20, #0x18\n"
        "tbz x11, #0, 44f\n"
        "ld1 { v21.h }[4], [x9]\n"
        "ld1 { v23.h }[4], [x26]\n"
        "b 44f\n"
        "38:"  // Height 2: Partial accumulate: partial_2_8
        "tbz x11, #1, 39f\n"
        "ldr s21, [x9], #0x4\n"
        "ldr s23, [x26], #0x4\n"
        "mov x20, #0x14\n"
        "tbz x11, #0, 44f\n"
        "ld1 { v21.h }[2], [x9]\n"
        "ld1 { v23.h }[2], [x26]\n"
        "b 44f\n"
        "39:"  // Height 2: Partial accumulate: partial_1_8
        "mov x20, #0x10\n"
        "tbz x11, #0, 44f\n"
        "ldr h21, [x9, #0x0]\n"
        "ldr h23, [x26, #0x0]\n"
        "b 44f\n"
        "40:"  // Height 2: Partial accumulate: partial_4_0
        "tbz x11, #2, 42f\n"
        "ldr d20, [x9], #0x8\n"
        "ldr d22, [x26], #0x8\n"
        "tbz x11, #1, 41f\n"
        "ld1 { v20.s }[2], [x9], #0x4\n"
        "ld1 { v22.s }[2], [x26], #0x4\n"
        "mov x20, #0xc\n"
        "tbz x11, #0, 44f\n"
        "ld1 { v20.h }[6], [x9]\n"
        "ld1 { v22.h }[6], [x26]\n"
        "b 44f\n"
        "41:"  // Height 2: Partial accumulate: partial_1_4
        "mov x20, #0x8\n"
        "tbz x11, #0, 44f\n"
        "ld1 { v20.h }[4], [x9]\n"
        "ld1 { v22.h }[4], [x26]\n"
        "b 44f\n"
        "42:"  // Height 2: Partial accumulate: partial_2_0
        "tbz x11, #1, 43f\n"
        "ldr s20, [x9], #0x4\n"
        "ldr s22, [x26], #0x4\n"
        "mov x20, #0x4\n"
        "tbz x11, #0, 44f\n"
        "ld1 { v20.h }[2], [x9]\n"
        "ld1 { v22.h }[2], [x26]\n"
        "b 44f\n"
        "43:"  // Height 2: Partial accumulate: partial_1_0
        "ldr h20, [x9, #0x0]\n"
        "ldr h22, [x26, #0x0]\n"
        "mov x20, #0x0\n"
        "44:"  // Height 2: Partial accumulate: Done
        "sub x9, x9, x20\n"
        "b 47f\n"
        "45:"  // Height 2: full accumulate
        "ldr q20, [x9, #0x0]\n"
        "ldr q21, [x9, #0x10]\n"
        "ldr q22, [x26, #0x0]\n"
        "ldr q23, [x26, #0x10]\n"
        "b 47f\n"
        "46:"  // Height 2: no accumulate
        "movi v20.16b, #0x0\n"
        "movi v21.16b, #0x0\n"
        "movi v22.16b, #0x0\n"
        "movi v23.16b, #0x0\n"
        "47:"  // Height 2: setup done
        "mov x28, #0x0\n"
        "48:"  // Height 2: String loop
        "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
        "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
        "ldr w27, [x20, x28, LSL #0x2]\n"
        "tbz %x[flags], #3, 49f\n"
        "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
        "add x20, x20, x21, LSL #3\n"
        "ldr x26, [x20, #0x0]\n"
        "ldr x25, [x20, #0x8]\n"
        "cbnz x28, 50f\n"
        "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
        "add x26, x26, x20, LSL #1\n"
        "add x25, x25, x20, LSL #1\n"
        "b 50f\n"
        "49:"  // Height 2: setup direct input
        "mov x26, %x[input_ptr]\n"
        "add x25, x26, x21, LSL #1\n"
        "50:"  // Height 2: input setup done
        "cmp x27, #0x8\n"
        "blt 53f\n"
        "ldr q0, [x26, #0x0]\n"
        "ldr q1, [x25, #0x0]\n"
        "cmp x27, #0x10\n"
        "ldr q6, [x10, #0x0]\n"
        "ldr q7, [x10, #0x10]\n"
        "ldr q8, [x10, #0x20]\n"
        "ldr q9, [x10, #0x30]\n"
        "ldr q10, [x10, #0x40]\n"
        "ldr q11, [x10, #0x50]\n"
        "ldr q12, [x10, #0x60]\n"
        "ldr q13, [x10, #0x70]\n"
        "ldr q14, [x10, #0x80]\n"
        "ldr q15, [x10, #0x90]\n"
        "ldr q16, [x10, #0xa0]\n"
        "ldr q17, [x10, #0xb0]\n"
        "ldr q18, [x10, #0xc0]\n"
        "ldr q19, [x10, #0xd0]\n"
        "blt 52f\n"
        "51:"  // Height 2: Multiply loop: Main loop head
        "fmla v20.8h, v6.8h, v0.h[0]\n"
        "fmla v22.8h, v6.8h, v1.h[0]\n"
        "ldr q6, [x10, #0xe0]\n"
        "sub x27, x27, #0x8\n"
        "fmla v21.8h, v7.8h, v0.h[0]\n"
        "fmla v23.8h, v7.8h, v1.h[0]\n"
        "ldr q7, [x10, #0xf0]\n"
        "add x26, x26, #0x10\n"
        "add x25, x25, #0x10\n"
        "cmp x27, #0x10\n"
        "prfm pldl1keep, [x26, #0x80]\n"
        "add x10, x10, #0x100\n"
        "prfm pldl1keep, [x25, #0x80]\n"
        "fmla v20.8h, v8.8h, v0.h[1]\n"
        "fmla v22.8h, v8.8h, v1.h[1]\n"
        "ldr q8, [x10, #0x20]\n"
        "fmla v21.8h, v9.8h, v0.h[1]\n"
        "fmla v23.8h, v9.8h, v1.h[1]\n"
        "ldr q9, [x10, #0x30]\n"
        "fmla v20.8h, v10.8h, v0.h[2]\n"
        "fmla v22.8h, v10.8h, v1.h[2]\n"
        "ldr q10, [x10, #0x40]\n"
        "fmla v21.8h, v11.8h, v0.h[2]\n"
        "fmla v23.8h, v11.8h, v1.h[2]\n"
        "ldr q11, [x10, #0x50]\n"
        "fmla v20.8h, v12.8h, v0.h[3]\n"
        "fmla v22.8h, v12.8h, v1.h[3]\n"
        "ldr q12, [x10, #0x60]\n"
        "fmla v21.8h, v13.8h, v0.h[3]\n"
        "fmla v23.8h, v13.8h, v1.h[3]\n"
        "ldr q13, [x10, #0x70]\n"
        "fmla v20.8h, v14.8h, v0.h[4]\n"
        "fmla v22.8h, v14.8h, v1.h[4]\n"
        "ldr q14, [x10, #0x80]\n"
        "fmla v21.8h, v15.8h, v0.h[4]\n"
        "fmla v23.8h, v15.8h, v1.h[4]\n"
        "ldr q15, [x10, #0x90]\n"
        "fmla v20.8h, v16.8h, v0.h[5]\n"
        "fmla v22.8h, v16.8h, v1.h[5]\n"
        "ldr q16, [x10, #0xa0]\n"
        "fmla v21.8h, v17.8h, v0.h[5]\n"
        "fmla v23.8h, v17.8h, v1.h[5]\n"
        "ldr q17, [x10, #0xb0]\n"
        "fmla v20.8h, v18.8h, v0.h[6]\n"
        "fmla v22.8h, v18.8h, v1.h[6]\n"
        "ldr q18, [x10, #0xc0]\n"
        "fmla v21.8h, v19.8h, v0.h[6]\n"
        "fmla v23.8h, v19.8h, v1.h[6]\n"
        "ldr q19, [x10, #0xd0]\n"
        "fmla v20.8h, v6.8h, v0.h[7]\n"
        "fmla v22.8h, v6.8h, v1.h[7]\n"
        "ldr q6, [x10, #0x0]\n"
        "fmla v21.8h, v7.8h, v0.h[7]\n"
        "ldr q0, [x26, #0x0]\n"
        "fmla v23.8h, v7.8h, v1.h[7]\n"
        "ldr q1, [x25, #0x0]\n"
        "ldr q7, [x10, #0x10]\n"
        "bge 51b\n"
        "52:"  // Height 2: Multiply loop: Single iteration only
        "fmla v20.8h, v6.8h, v0.h[0]\n"
        "fmla v22.8h, v6.8h, v1.h[0]\n"
        "ldr q6, [x10, #0xe0]\n"
        "add x26, x26, #0x10\n"
        "fmla v21.8h, v7.8h, v0.h[0]\n"
        "fmla v23.8h, v7.8h, v1.h[0]\n"
        "ldr q7, [x10, #0xf0]\n"
        "add x25, x25, #0x10\n"
        "sub x27, x27, #0x8\n"
        "prfm pldl1keep, [x26, #0x80]\n"
        "add x10, x10, #0x100\n"
        "prfm pldl1keep, [x25, #0x80]\n"
        "fmla v20.8h, v8.8h, v0.h[1]\n"
        "fmla v22.8h, v8.8h, v1.h[1]\n"
        "fmla v21.8h, v9.8h, v0.h[1]\n"
        "fmla v23.8h, v9.8h, v1.h[1]\n"
        "fmla v20.8h, v10.8h, v0.h[2]\n"
        "fmla v22.8h, v10.8h, v1.h[2]\n"
        "fmla v21.8h, v11.8h, v0.h[2]\n"
        "fmla v23.8h, v11.8h, v1.h[2]\n"
        "fmla v20.8h, v12.8h, v0.h[3]\n"
        "fmla v22.8h, v12.8h, v1.h[3]\n"
        "fmla v21.8h, v13.8h, v0.h[3]\n"
        "fmla v23.8h, v13.8h, v1.h[3]\n"
        "fmla v20.8h, v14.8h, v0.h[4]\n"
        "fmla v22.8h, v14.8h, v1.h[4]\n"
        "fmla v21.8h, v15.8h, v0.h[4]\n"
        "fmla v23.8h, v15.8h, v1.h[4]\n"
        "fmla v20.8h, v16.8h, v0.h[5]\n"
        "fmla v22.8h, v16.8h, v1.h[5]\n"
        "fmla v21.8h, v17.8h, v0.h[5]\n"
        "fmla v23.8h, v17.8h, v1.h[5]\n"
        "fmla v20.8h, v18.8h, v0.h[6]\n"
        "fmla v22.8h, v18.8h, v1.h[6]\n"
        "fmla v21.8h, v19.8h, v0.h[6]\n"
        "fmla v23.8h, v19.8h, v1.h[6]\n"
        "fmla v20.8h, v6.8h, v0.h[7]\n"
        "fmla v22.8h, v6.8h, v1.h[7]\n"
        "fmla v21.8h, v7.8h, v0.h[7]\n"
        "fmla v23.8h, v7.8h, v1.h[7]\n"
        "53:"  // Height 2: Multiply loop: Main loop skip
        "cbz x27, 55f\n"
        "54:"  // Height 2: Multiply loop: Odd block loop
        "ldr h0, [x26], #0x2\n"
        "ldr h1, [x25], #0x2\n"
        "sub x27, x27, #0x1\n"
        "ldr q8, [x10, #0x0]\n"
        "ldr q9, [x10, #0x10]\n"
        "add x10, x10, #0x20\n"
        "fmla v20.8h, v8.8h, v0.h[0]\n"
        "fmla v22.8h, v8.8h, v1.h[0]\n"
        "fmla v21.8h, v9.8h, v0.h[0]\n"
        "fmla v23.8h, v9.8h, v1.h[0]\n"
        "cbnz x27, 54b\n"
        "55:"  // Height 2: Multiply loop: No odd multiplies
        "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
        "add x28, x28, #0x1\n"
        "cmp x28, x20\n"
        "bne 48b\n"
        "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
        "prfm pstl1keep, [x9, #0x0]\n"
        "add x26, x9, x20, LSL #1\n"
        "prfm pstl1keep, [x26, #0x0]\n"
        "tbz %x[flags], #1, 56f\n"
        "add x21, %x[args_ptr], %[offset_max]\n"
        "add x20, %x[args_ptr], %[offset_min]\n"
        "ld1r { v17.8h }, [x21]\n"
        "ld1r { v16.8h }, [x20]\n"
        "fmin v20.8h, v20.8h, v17.8h\n"
        "fmin v21.8h, v21.8h, v17.8h\n"
        "fmin v22.8h, v22.8h, v17.8h\n"
        "fmin v23.8h, v23.8h, v17.8h\n"
        "fmax v20.8h, v20.8h, v16.8h\n"
        "fmax v21.8h, v21.8h, v16.8h\n"
        "fmax v22.8h, v22.8h, v16.8h\n"
        "fmax v23.8h, v23.8h, v16.8h\n"
        "56:"  // Height 2: No activation
        "cmp x11, #0x10\n"
        "bge 65f\n"
        "tbz x11, #3, 60f\n"
        "st1 { v20.8h }, [x9], #0x10\n"
        "st1 { v22.8h }, [x26], #0x10\n"
        "tbz x11, #2, 58f\n"
        "str d21, [x9], #0x8\n"
        "str d23, [x26], #0x8\n"
        "tbz x11, #1, 57f\n"
        "st1 { v21.s }[2], [x9], #0x4\n"
        "st1 { v23.s }[2], [x26], #0x4\n"
        "tbz x11, #0, 64f\n"
        "st1 { v21.h }[6], [x9]\n"
        "st1 { v23.h }[6], [x26]\n"
        "b 64f\n"
        "57:"  // Height 2: Partial direct writeback: partial_1_12
        "tbz x11, #0, 64f\n"
        "st1 { v21.h }[4], [x9]\n"
        "st1 { v23.h }[4], [x26]\n"
        "b 64f\n"
        "58:"  // Height 2: Partial direct writeback: partial_2_8
        "tbz x11, #1, 59f\n"
        "str s21, [x9], #0x4\n"
        "str s23, [x26], #0x4\n"
        "tbz x11, #0, 64f\n"
        "st1 { v21.h }[2], [x9]\n"
        "st1 { v23.h }[2], [x26]\n"
        "b 64f\n"
        "59:"  // Height 2: Partial direct writeback: partial_1_8
        "tbz x11, #0, 64f\n"
        "str h21, [x9, #0x0]\n"
        "str h23, [x26, #0x0]\n"
        "b 64f\n"
        "60:"  // Height 2: Partial direct writeback: partial_4_0
        "tbz x11, #2, 62f\n"
        "str d20, [x9], #0x8\n"
        "str d22, [x26], #0x8\n"
        "tbz x11, #1, 61f\n"
        "st1 { v20.s }[2], [x9], #0x4\n"
        "st1 { v22.s }[2], [x26], #0x4\n"
        "tbz x11, #0, 64f\n"
        "st1 { v20.h }[6], [x9]\n"
        "st1 { v22.h }[6], [x26]\n"
        "b 64f\n"
        "61:"  // Height 2: Partial direct writeback: partial_1_4
        "tbz x11, #0, 64f\n"
        "st1 { v20.h }[4], [x9]\n"
        "st1 { v22.h }[4], [x26]\n"
        "b 64f\n"
        "62:"  // Height 2: Partial direct writeback: partial_2_0
        "tbz x11, #1, 63f\n"
        "str s20, [x9], #0x4\n"
        "str s22, [x26], #0x4\n"
        "tbz x11, #0, 64f\n"
        "st1 { v20.h }[2], [x9]\n"
        "st1 { v22.h }[2], [x26]\n"
        "b 64f\n"
        "63:"  // Height 2: Partial direct writeback: partial_1_0
        "str h20, [x9, #0x0]\n"
        "str h22, [x26, #0x0]\n"
        "64:"  // Height 2: Partial direct writeback: Done
        "b 66f\n"
        "65:"  // Height 2: Full writeback
        "str q20, [x9, #0x0]\n"
        "str q21, [x9, #0x10]\n"
        "add x9, x9, #0x20\n"
        "str q22, [x26, #0x0]\n"
        "str q23, [x26, #0x10]\n"
        "66:"  // Height 2: Writeback done
        "subs x11, x11, #0x10\n"
        "bgt 35b\n"
        "b 200f\n"
        "67:"  // Height 3
        "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
        "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
        "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
        "68:"  // Height 3: Column loop
        "cbz x10, 69f\n"
        "ldr q20, [x10, #0x0]\n"
        "ldr q21, [x10, #0x10]\n"
        "add x10, x10, #0x20\n"
        "mov v22.16b, v20.16b\n"
        "mov v23.16b, v21.16b\n"
        "mov v24.16b, v20.16b\n"
        "mov v25.16b, v21.16b\n"
        "b 80f\n"
        "69:"  // Height 3: no bias
        "tbz %x[flags], #0, 79f\n"
        "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
        "cmp x11, #0x10\n"
        "add x26, x9, x20, LSL #1\n"
        "add x25, x26, x20, LSL #1\n"
        "bge 78f\n"
        "tbz x11, #3, 73f\n"
        "ld1 { v20.8h }, [x9], #0x10\n"
        "ld1 { v22.8h }, [x26], #0x10\n"
        "ld1 { v24.8h }, [x25], #0x10\n"
        "tbz x11, #2, 71f\n"
        "ldr d21, [x9], #0x8\n"
        "ldr d23, [x26], #0x8\n"
        "ldr d25, [x25], #0x8\n"
        "tbz x11, #1, 70f\n"
        "ld1 { v21.s }[2], [x9], #0x4\n"
        "ld1 { v23.s }[2], [x26], #0x4\n"
        "mov x20, #0x1c\n"
        "ld1 { v25.s }[2], [x25], #0x4\n"
        "tbz x11, #0, 77f\n"
        "ld1 { v21.h }[6], [x9]\n"
        "ld1 { v23.h }[6], [x26]\n"
        "ld1 { v25.h }[6], [x25]\n"
        "b 77f\n"
        "70:"  // Height 3: Partial accumulate: partial_1_12
        "mov x20, #0x18\n"
        "tbz x11, #0, 77f\n"
        "ld1 { v21.h }[4], [x9]\n"
        "ld1 { v23.h }[4], [x26]\n"
        "ld1 { v25.h }[4], [x25]\n"
        "b 77f\n"
        "71:"  // Height 3: Partial accumulate: partial_2_8
        "tbz x11, #1, 72f\n"
        "ldr s21, [x9], #0x4\n"
        "ldr s23, [x26], #0x4\n"
        "mov x20, #0x14\n"
        "ldr s25, [x25], #0x4\n"
        "tbz x11, #0, 77f\n"
        "ld1 { v21.h }[2], [x9]\n"
        "ld1 { v23.h }[2], [x26]\n"
        "ld1 { v25.h }[2], [x25]\n"
        "b 77f\n"
        "72:"  // Height 3: Partial accumulate: partial_1_8
        "mov x20, #0x10\n"
        "tbz x11, #0, 77f\n"
        "ldr h21, [x9, #0x0]\n"
        "ldr h23, [x26, #0x0]\n"
        "ldr h25, [x25, #0x0]\n"
        "b 77f\n"
        "73:"  // Height 3: Partial accumulate: partial_4_0
        "tbz x11, #2, 75f\n"
        "ldr d20, [x9], #0x8\n"
        "ldr d22, [x26], #0x8\n"
        "ldr d24, [x25], #0x8\n"
        "tbz x11, #1, 74f\n"
        "ld1 { v20.s }[2], [x9], #0x4\n"
        "ld1 { v22.s }[2], [x26], #0x4\n"
        "mov x20, #0xc\n"
        "ld1 { v24.s }[2], [x25], #0x4\n"
        "tbz x11, #0, 77f\n"
        "ld1 { v20.h }[6], [x9]\n"
        "ld1 { v22.h }[6], [x26]\n"
        "ld1 { v24.h }[6], [x25]\n"
        "b 77f\n"
        "74:"  // Height 3: Partial accumulate: partial_1_4
        "mov x20, #0x8\n"
        "tbz x11, #0, 77f\n"
        "ld1 { v20.h }[4], [x9]\n"
        "ld1 { v22.h }[4], [x26]\n"
        "ld1 { v24.h }[4], [x25]\n"
        "b 77f\n"
        "75:"  // Height 3: Partial accumulate: partial_2_0
        "tbz x11, #1, 76f\n"
        "ldr s20, [x9], #0x4\n"
        "ldr s22, [x26], #0x4\n"
        "mov x20, #0x4\n"
        "ldr s24, [x25], #0x4\n"
        "tbz x11, #0, 77f\n"
        "ld1 { v20.h }[2], [x9]\n"
        "ld1 { v22.h }[2], [x26]\n"
        "ld1 { v24.h }[2], [x25]\n"
        "b 77f\n"
        "76:"  // Height 3: Partial accumulate: partial_1_0
        "ldr h20, [x9, #0x0]\n"
        "ldr h22, [x26, #0x0]\n"
        "mov x20, #0x0\n"
        "ldr h24, [x25, #0x0]\n"
        "77:"  // Height 3: Partial accumulate: Done
        "sub x9, x9, x20\n"
        "b 80f\n"
        "78:"  // Height 3: full accumulate
        "ldr q20, [x9, #0x0]\n"
        "ldr q21, [x9, #0x10]\n"
        "ldr q22, [x26, #0x0]\n"
        "ldr q23, [x26, #0x10]\n"
        "ldr q24, [x25, #0x0]\n"
        "ldr q25, [x25, #0x10]\n"
        "b 80f\n"
        "79:"  // Height 3: no accumulate
        "movi v20.16b, #0x0\n"
        "movi v21.16b, #0x0\n"
        "movi v22.16b, #0x0\n"
        "movi v23.16b, #0x0\n"
        "movi v24.16b, #0x0\n"
        "movi v25.16b, #0x0\n"
        "80:"  // Height 3: setup done
        "mov x28, #0x0\n"
        "81:"  // Height 3: String loop
        "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
        "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
        "ldr w27, [x20, x28, LSL #0x2]\n"
        "tbz %x[flags], #3, 82f\n"
        "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
        "add x20, x20, x21, LSL #3\n"
        "ldr x26, [x20, #0x0]\n"
        "ldr x25, [x20, #0x8]\n"
        "ldr x24, [x20, #0x10]\n"
        "cbnz x28, 83f\n"
        "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
        "add x26, x26, x20, LSL #1\n"
        "add x25, x25, x20, LSL #1\n"
        "add x24, x24, x20, LSL #1\n"
        "b 83f\n"
        "82:"  // Height 3: setup direct input
        "mov x26, %x[input_ptr]\n"
        "add x25, x26, x21, LSL #1\n"
        "add x24, x25, x21, LSL #1\n"
        "83:"  // Height 3: input setup done
        "cmp x27, #0x8\n"
        "blt 86f\n"
        "ldr q0, [x26, #0x0]\n"
        "ldr q1, [x25, #0x0]\n"
        "cmp x27, #0x10\n"
        "ldr q2, [x24, #0x0]\n"
        "ldr q6, [x10, #0x0]\n"
        "ldr q7, [x10, #0x10]\n"
        "ldr q8, [x10, #0x20]\n"
        "ldr q9, [x10, #0x30]\n"
        "ldr q10, [x10, #0x40]\n"
        "ldr q11, [x10, #0x50]\n"
        "ldr q12, [x10, #0x60]\n"
        "ldr q13, [x10, #0x70]\n"
        "ldr q14, [x10, #0x80]\n"
        "ldr q15, [x10, #0x90]\n"
        "ldr q16, [x10, #0xa0]\n"
        "ldr q17, [x10, #0xb0]\n"
        "ldr q18, [x10, #0xc0]\n"
        "ldr q19, [x10, #0xd0]\n"
        "blt 85f\n"
        "84:"  // Height 3: Multiply loop: Main loop head
        "fmla v20.8h, v6.8h, v0.h[0]\n"
        "fmla v22.8h, v6.8h, v1.h[0]\n"
        "sub x27, x27, #0x8\n"
        "add x26, x26, #0x10\n"
        "fmla v24.8h, v6.8h, v2.h[0]\n"
        "ldr q6, [x10, #0xe0]\n"
        "fmla v21.8h, v7.8h, v0.h[0]\n"
        "add x25, x25, #0x10\n"
        "fmla v23.8h, v7.8h, v1.h[0]\n"
        "fmla v25.8h, v7.8h, v2.h[0]\n"
        "ldr q7, [x10, #0xf0]\n"
        "add x24, x24, #0x10\n"
        "cmp x27, #0x10\n"
        "add x10, x10, #0x100\n"
        "prfm pldl1keep, [x26, #0x80]\n"
        "prfm pldl1keep, [x25, #0x80]\n"
        "fmla v20.8h, v8.8h, v0.h[1]\n"
        "fmla v22.8h, v8.8h, v1.h[1]\n"
        "prfm pldl1keep, [x24, #0x80]\n"
        "fmla v24.8h, v8.8h, v2.h[1]\n"
        "ldr q8, [x10, #0x20]\n"
        "fmla v21.8h, v9.8h, v0.h[1]\n"
        "fmla v23.8h, v9.8h, v1.h[1]\n"
        "fmla v25.8h, v9.8h, v2.h[1]\n"
        "ldr q9, [x10, #0x30]\n"
        "fmla v20.8h, v10.8h, v0.h[2]\n"
        "fmla v22.8h, v10.8h, v1.h[2]\n"
        "fmla v24.8h, v10.8h, v2.h[2]\n"
        "ldr q10, [x10, #0x40]\n"
        "fmla v21.8h, v11.8h, v0.h[2]\n"
        "fmla v23.8h, v11.8h, v1.h[2]\n"
        "fmla v25.8h, v11.8h, v2.h[2]\n"
        "ldr q11, [x10, #0x50]\n"
        "fmla v20.8h, v12.8h, v0.h[3]\n"
        "fmla v22.8h, v12.8h, v1.h[3]\n"
        "fmla v24.8h, v12.8h, v2.h[3]\n"
        "ldr q12, [x10, #0x60]\n"
        "fmla v21.8h, v13.8h, v0.h[3]\n"
        "fmla v23.8h, v13.8h, v1.h[3]\n"
        "fmla v25.8h, v13.8h, v2.h[3]\n"
        "ldr q13, [x10, #0x70]\n"
        "fmla v20.8h, v14.8h, v0.h[4]\n"
        "fmla v22.8h, v14.8h, v1.h[4]\n"
        "fmla v24.8h, v14.8h, v2.h[4]\n"
        "ldr q14, [x10, #0x80]\n"
        "fmla v21.8h, v15.8h, v0.h[4]\n"
        "fmla v23.8h, v15.8h, v1.h[4]\n"
        "fmla v25.8h, v15.8h, v2.h[4]\n"
        "ldr q15, [x10, #0x90]\n"
        "fmla v20.8h, v16.8h, v0.h[5]\n"
        "fmla v22.8h, v16.8h, v1.h[5]\n"
        "fmla v24.8h, v16.8h, v2.h[5]\n"
        "ldr q16, [x10, #0xa0]\n"
        "fmla v21.8h, v17.8h, v0.h[5]\n"
        "fmla v23.8h, v17.8h, v1.h[5]\n"
        "fmla v25.8h, v17.8h, v2.h[5]\n"
        "ldr q17, [x10, #0xb0]\n"
        "fmla v20.8h, v18.8h, v0.h[6]\n"
        "fmla v22.8h, v18.8h, v1.h[6]\n"
        "fmla v24.8h, v18.8h, v2.h[6]\n"
        "ldr q18, [x10, #0xc0]\n"
        "fmla v21.8h, v19.8h, v0.h[6]\n"
        "fmla v23.8h, v19.8h, v1.h[6]\n"
        "fmla v25.8h, v19.8h, v2.h[6]\n"
        "ldr q19, [x10, #0xd0]\n"
        "fmla v20.8h, v6.8h, v0.h[7]\n"
        "fmla v22.8h, v6.8h, v1.h[7]\n"
        "fmla v24.8h, v6.8h, v2.h[7]\n"
        "ldr q6, [x10, #0x0]\n"
        "fmla v21.8h, v7.8h, v0.h[7]\n"
        "ldr q0, [x26, #0x0]\n"
        "fmla v23.8h, v7.8h, v1.h[7]\n"
        "ldr q1, [x25, #0x0]\n"
        "fmla v25.8h, v7.8h, v2.h[7]\n"
        "ldr q2, [x24, #0x0]\n"
        "ldr q7, [x10, #0x10]\n"
        "bge 84b\n"
        "85:"  // Height 3: Multiply loop: Single iteration only
        "fmla v20.8h, v6.8h, v0.h[0]\n"
        "fmla v22.8h, v6.8h, v1.h[0]\n"
        "add x26, x26, #0x10\n"
        "add x25, x25, #0x10\n"
        "fmla v24.8h, v6.8h, v2.h[0]\n"
        "ldr q6, [x10, #0xe0]\n"
        "fmla v21.8h, v7.8h, v0.h[0]\n"
        "add x24, x24, #0x10\n"
        "fmla v23.8h, v7.8h, v1.h[0]\n"
        "fmla v25.8h, v7.8h, v2.h[0]\n"
        "ldr q7, [x10, #0xf0]\n"
        "prfm pldl1keep, [x26, #0x80]\n"
        "sub x27, x27, #0x8\n"
        "prfm pldl1keep, [x25, #0x80]\n"
        "add x10, x10, #0x100\n"
        "prfm pldl1keep, [x24, #0x80]\n"
        "fmla v20.8h, v8.8h, v0.h[1]\n"
        "fmla v22.8h, v8.8h, v1.h[1]\n"
        "fmla v24.8h, v8.8h, v2.h[1]\n"
        "fmla v21.8h, v9.8h, v0.h[1]\n"
        "fmla v23.8h, v9.8h, v1.h[1]\n"
        "fmla v25.8h, v9.8h, v2.h[1]\n"
        "fmla v20.8h, v10.8h, v0.h[2]\n"
        "fmla v22.8h, v10.8h, v1.h[2]\n"
        "fmla v24.8h, v10.8h, v2.h[2]\n"
        "fmla v21.8h, v11.8h, v0.h[2]\n"
        "fmla v23.8h, v11.8h, v1.h[2]\n"
        "fmla v25.8h, v11.8h, v2.h[2]\n"
        "fmla v20.8h, v12.8h, v0.h[3]\n"
        "fmla v22.8h, v12.8h, v1.h[3]\n"
        "fmla v24.8h, v12.8h, v2.h[3]\n"
        "fmla v21.8h, v13.8h, v0.h[3]\n"
        "fmla v23.8h, v13.8h, v1.h[3]\n"
        "fmla v25.8h, v13.8h, v2.h[3]\n"
        "fmla v20.8h, v14.8h, v0.h[4]\n"
        "fmla v22.8h, v14.8h, v1.h[4]\n"
        "fmla v24.8h, v14.8h, v2.h[4]\n"
        "fmla v21.8h, v15.8h, v0.h[4]\n"
        "fmla v23.8h, v15.8h, v1.h[4]\n"
        "fmla v25.8h, v15.8h, v2.h[4]\n"
        "fmla v20.8h, v16.8h, v0.h[5]\n"
        "fmla v22.8h, v16.8h, v1.h[5]\n"
        "fmla v24.8h, v16.8h, v2.h[5]\n"
        "fmla v21.8h, v17.8h, v0.h[5]\n"
        "fmla v23.8h, v17.8h, v1.h[5]\n"
        "fmla v25.8h, v17.8h, v2.h[5]\n"
        "fmla v20.8h, v18.8h, v0.h[6]\n"
        "fmla v22.8h, v18.8h, v1.h[6]\n"
        "fmla v24.8h, v18.8h, v2.h[6]\n"
        "fmla v21.8h, v19.8h, v0.h[6]\n"
        "fmla v23.8h, v19.8h, v1.h[6]\n"
        "fmla v25.8h, v19.8h, v2.h[6]\n"
        "fmla v20.8h, v6.8h, v0.h[7]\n"
        "fmla v22.8h, v6.8h, v1.h[7]\n"
        "fmla v24.8h, v6.8h, v2.h[7]\n"
        "fmla v21.8h, v7.8h, v0.h[7]\n"
        "fmla v23.8h, v7.8h, v1.h[7]\n"
        "fmla v25.8h, v7.8h, v2.h[7]\n"
        "86:"  // Height 3: Multiply loop: Main loop skip
        "cbz x27, 88f\n"
        "87:"  // Height 3: Multiply loop: Odd block loop
        "ldr h0, [x26], #0x2\n"
        "ldr h1, [x25], #0x2\n"
        "sub x27, x27, #0x1\n"
        "ldr h2, [x24], #0x2\n"
        "ldr q8, [x10, #0x0]\n"
        "ldr q9, [x10, #0x10]\n"
        "add x10, x10, #0x20\n"
        "fmla v20.8h, v8.8h, v0.h[0]\n"
        "fmla v22.8h, v8.8h, v1.h[0]\n"
        "fmla v24.8h, v8.8h, v2.h[0]\n"
        "fmla v21.8h, v9.8h, v0.h[0]\n"
        "fmla v23.8h, v9.8h, v1.h[0]\n"
        "fmla v25.8h, v9.8h, v2.h[0]\n"
        "cbnz x27, 87b\n"
        "88:"  // Height 3: Multiply loop: No odd multiplies
        "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
        "add x28, x28, #0x1\n"
        "cmp x28, x20\n"
        "bne 81b\n"
        "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
        "prfm pstl1keep, [x9, #0x0]\n"
        "add x26, x9, x20, LSL #1\n"
        "prfm pstl1keep, [x26, #0x0]\n"
        "add x25, x26, x20, LSL #1\n"
        "prfm pstl1keep, [x25, #0x0]\n"
        "tbz %x[flags], #1, 89f\n"
        "add x21, %x[args_ptr], %[offset_max]\n"
        "add x20, %x[args_ptr], %[offset_min]\n"
        "ld1r { v17.8h }, [x21]\n"
        "ld1r { v16.8h }, [x20]\n"
        "fmin v20.8h, v20.8h, v17.8h\n"
        "fmin v21.8h, v21.8h, v17.8h\n"
        "fmin v22.8h, v22.8h, v17.8h\n"
        "fmin v23.8h, v23.8h, v17.8h\n"
        "fmin v24.8h, v24.8h, v17.8h\n"
        "fmin v25.8h, v25.8h, v17.8h\n"
        "fmax v20.8h, v20.8h, v16.8h\n"
        "fmax v21.8h, v21.8h, v16.8h\n"
        "fmax v22.8h, v22.8h, v16.8h\n"
        "fmax v23.8h, v23.8h, v16.8h\n"
        "fmax v24.8h, v24.8h, v16.8h\n"
        "fmax v25.8h, v25.8h, v16.8h\n"
        "89:"  // Height 3: No activation
        "cmp x11, #0x10\n"
        "bge 98f\n"
        "tbz x11, #3, 93f\n"
        "st1 { v20.8h }, [x9], #0x10\n"
        "st1 { v22.8h }, [x26], #0x10\n"
        "st1 { v24.8h }, [x25], #0x10\n"
        "tbz x11, #2, 91f\n"
        "str d21, [x9], #0x8\n"
        "str d23, [x26], #0x8\n"
        "str d25, [x25], #0x8\n"
        "tbz x11, #1, 90f\n"
        "st1 { v21.s }[2], [x9], #0x4\n"
        "st1 { v23.s }[2], [x26], #0x4\n"
        "st1 { v25.s }[2], [x25], #0x4\n"
        "tbz x11, #0, 97f\n"
        "st1 { v21.h }[6], [x9]\n"
        "st1 { v23.h }[6], [x26]\n"
        "st1 { v25.h }[6], [x25]\n"
        "b 97f\n"
        "90:"  // Height 3: Partial direct writeback: partial_1_12
        "tbz x11, #0, 97f\n"
        "st1 { v21.h }[4], [x9]\n"
        "st1 { v23.h }[4], [x26]\n"
        "st1 { v25.h }[4], [x25]\n"
        "b 97f\n"
        "91:"  // Height 3: Partial direct writeback: partial_2_8
        "tbz x11, #1, 92f\n"
        "str s21, [x9], #0x4\n"
        "str s23, [x26], #0x4\n"
        "str s25, [x25], #0x4\n"
        "tbz x11, #0, 97f\n"
        "st1 { v21.h }[2], [x9]\n"
        "st1 { v23.h }[2], [x26]\n"
        "st1 { v25.h }[2], [x25]\n"
        "b 97f\n"
        "92:"  // Height 3: Partial direct writeback: partial_1_8
        "tbz x11, #0, 97f\n"
        "str h21, [x9, #0x0]\n"
        "str h23, [x26, #0x0]\n"
        "str h25, [x25, #0x0]\n"
        "b 97f\n"
        "93:"  // Height 3: Partial direct writeback: partial_4_0
        "tbz x11, #2, 95f\n"
        "str d20, [x9], #0x8\n"
        "str d22, [x26], #0x8\n"
        "str d24, [x25], #0x8\n"
        "tbz x11, #1, 94f\n"
        "st1 { v20.s }[2], [x9], #0x4\n"
        "st1 { v22.s }[2], [x26], #0x4\n"
        "st1 { v24.s }[2], [x25], #0x4\n"
        "tbz x11, #0, 97f\n"
        "st1 { v20.h }[6], [x9]\n"
        "st1 { v22.h }[6], [x26]\n"
        "st1 { v24.h }[6], [x25]\n"
        "b 97f\n"
        "94:"  // Height 3: Partial direct writeback: partial_1_4
        "tbz x11, #0, 97f\n"
        "st1 { v20.h }[4], [x9]\n"
        "st1 { v22.h }[4], [x26]\n"
        "st1 { v24.h }[4], [x25]\n"
        "b 97f\n"
        "95:"  // Height 3: Partial direct writeback: partial_2_0
        "tbz x11, #1, 96f\n"
        "str s20, [x9], #0x4\n"
        "str s22, [x26], #0x4\n"
        "str s24, [x25], #0x4\n"
        "tbz x11, #0, 97f\n"
        "st1 { v20.h }[2], [x9]\n"
        "st1 { v22.h }[2], [x26]\n"
        "st1 { v24.h }[2], [x25]\n"
        "b 97f\n"
        "96:"  // Height 3: Partial direct writeback: partial_1_0
        "str h20, [x9, #0x0]\n"
        "str h22, [x26, #0x0]\n"
        "str h24, [x25, #0x0]\n"
        "97:"  // Height 3: Partial direct writeback: Done
        "b 99f\n"
        "98:"  // Height 3: Full writeback
        "str q20, [x9, #0x0]\n"
        "str q21, [x9, #0x10]\n"
        "add x9, x9, #0x20\n"
        "str q22, [x26, #0x0]\n"
        "str q23, [x26, #0x10]\n"
        "str q24, [x25, #0x0]\n"
        "str q25, [x25, #0x10]\n"
        "99:"  // Height 3: Writeback done
        "subs x11, x11, #0x10\n"
        "bgt 68b\n"
        "b 200f\n"
        "100:"  // Height 4
        "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
        "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
        "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
        "101:"  // Height 4: Column loop
        "cbz x10, 102f\n"
        "ldr q20, [x10, #0x0]\n"
        "ldr q21, [x10, #0x10]\n"
        "add x10, x10, #0x20\n"
        "mov v22.16b, v20.16b\n"
        "mov v23.16b, v21.16b\n"
        "mov v24.16b, v20.16b\n"
        "mov v25.16b, v21.16b\n"
        "mov v26.16b, v20.16b\n"
        "mov v27.16b, v21.16b\n"
        "b 113f\n"
        "102:"  // Height 4: no bias
        "tbz %x[flags], #0, 112f\n"
        "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
        "cmp x11, #0x10\n"
        "add x26, x9, x20, LSL #1\n"
        "add x25, x26, x20, LSL #1\n"
        "add x24, x25, x20, LSL #1\n"
        "bge 111f\n"
        "tbz x11, #3, 106f\n"
        "ld1 { v20.8h }, [x9], #0x10\n"
        "ld1 { v22.8h }, [x26], #0x10\n"
        "ld1 { v24.8h }, [x25], #0x10\n"
        "ld1 { v26.8h }, [x24], #0x10\n"
        "tbz x11, #2, 104f\n"
        "ldr d21, [x9], #0x8\n"
        "ldr d23, [x26], #0x8\n"
        "ldr d25, [x25], #0x8\n"
        "ldr d27, [x24], #0x8\n"
        "tbz x11, #1, 103f\n"
        "ld1 { v21.s }[2], [x9], #0x4\n"
        "ld1 { v23.s }[2], [x26], #0x4\n"
        "mov x20, #0x1c\n"
        "ld1 { v25.s }[2], [x25], #0x4\n"
        "ld1 { v27.s }[2], [x24], #0x4\n"
        "tbz x11, #0, 110f\n"
        "ld1 { v21.h }[6], [x9]\n"
        "ld1 { v23.h }[6], [x26]\n"
        "ld1 { v25.h }[6], [x25]\n"
        "ld1 { v27.h }[6], [x24]\n"
        "b 110f\n"
        "103:"  // Height 4: Partial accumulate: partial_1_12
        "mov x20, #0x18\n"
        "tbz x11, #0, 110f\n"
        "ld1 { v21.h }[4], [x9]\n"
        "ld1 { v23.h }[4], [x26]\n"
        "ld1 { v25.h }[4], [x25]\n"
        "ld1 { v27.h }[4], [x24]\n"
        "b 110f\n"
        "104:"  // Height 4: Partial accumulate: partial_2_8
        "tbz x11, #1, 105f\n"
        "ldr s21, [x9], #0x4\n"
        "ldr s23, [x26], #0x4\n"
        "mov x20, #0x14\n"
        "ldr s25, [x25], #0x4\n"
        "ldr s27, [x24], #0x4\n"
        "tbz x11, #0, 110f\n"
        "ld1 { v21.h }[2], [x9]\n"
        "ld1 { v23.h }[2], [x26]\n"
        "ld1 { v25.h }[2], [x25]\n"
        "ld1 { v27.h }[2], [x24]\n"
        "b 110f\n"
        "105:"  // Height 4: Partial accumulate: partial_1_8
        "mov x20, #0x10\n"
        "tbz x11, #0, 110f\n"
        "ldr h21, [x9, #0x0]\n"
        "ldr h23, [x26, #0x0]\n"
        "ldr h25, [x25, #0x0]\n"
        "ldr h27, [x24, #0x0]\n"
        "b 110f\n"
        "106:"  // Height 4: Partial accumulate: partial_4_0
        "tbz x11, #2, 108f\n"
        "ldr d20, [x9], #0x8\n"
        "ldr d22, [x26], #0x8\n"
        "ldr d24, [x25], #0x8\n"
        "ldr d26, [x24], #0x8\n"
        "tbz x11, #1, 107f\n"
        "ld1 { v20.s }[2], [x9], #0x4\n"
        "ld1 { v22.s }[2], [x26], #0x4\n"
        "mov x20, #0xc\n"
        "ld1 { v24.s }[2], [x25], #0x4\n"
        "ld1 { v26.s }[2], [x24], #0x4\n"
        "tbz x11, #0, 110f\n"
        "ld1 { v20.h }[6], [x9]\n"
        "ld1 { v22.h }[6], [x26]\n"
        "ld1 { v24.h }[6], [x25]\n"
        "ld1 { v26.h }[6], [x24]\n"
        "b 110f\n"
        "107:"  // Height 4: Partial accumulate: partial_1_4
        "mov x20, #0x8\n"
        "tbz x11, #0, 110f\n"
        "ld1 { v20.h }[4], [x9]\n"
        "ld1 { v22.h }[4], [x26]\n"
        "ld1 { v24.h }[4], [x25]\n"
        "ld1 { v26.h }[4], [x24]\n"
        "b 110f\n"
        "108:"  // Height 4: Partial accumulate: partial_2_0
        "tbz x11, #1, 109f\n"
        "ldr s20, [x9], #0x4\n"
        "ldr s22, [x26], #0x4\n"
        "mov x20, #0x4\n"
        "ldr s24, [x25], #0x4\n"
        "ldr s26, [x24], #0x4\n"
        "tbz x11, #0, 110f\n"
        "ld1 { v20.h }[2], [x9]\n"
        "ld1 { v22.h }[2], [x26]\n"
        "ld1 { v24.h }[2], [x25]\n"
        "ld1 { v26.h }[2], [x24]\n"
        "b 110f\n"
        "109:"  // Height 4: Partial accumulate: partial_1_0
        "ldr h20, [x9, #0x0]\n"
        "ldr h22, [x26, #0x0]\n"
        "mov x20, #0x0\n"
        "ldr h24, [x25, #0x0]\n"
        "ldr h26, [x24, #0x0]\n"
        "110:"  // Height 4: Partial accumulate: Done
        "sub x9, x9, x20\n"
        "b 113f\n"
        "111:"  // Height 4: full accumulate
        "ldr q20, [x9, #0x0]\n"
        "ldr q21, [x9, #0x10]\n"
        "ldr q22, [x26, #0x0]\n"
        "ldr q23, [x26, #0x10]\n"
        "ldr q24, [x25, #0x0]\n"
        "ldr q25, [x25, #0x10]\n"
        "ldr q26, [x24, #0x0]\n"
        "ldr q27, [x24, #0x10]\n"
        "b 113f\n"
        "112:"  // Height 4: no accumulate
        "movi v20.16b, #0x0\n"
        "movi v21.16b, #0x0\n"
        "movi v22.16b, #0x0\n"
        "movi v23.16b, #0x0\n"
        "movi v24.16b, #0x0\n"
        "movi v25.16b, #0x0\n"
        "movi v26.16b, #0x0\n"
        "movi v27.16b, #0x0\n"
        "113:"  // Height 4: setup done
        "mov x28, #0x0\n"
        "114:"  // Height 4: String loop
        "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
        "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
        "ldr w27, [x20, x28, LSL #0x2]\n"
        "tbz %x[flags], #3, 115f\n"
        "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
        "add x20, x20, x21, LSL #3\n"
        "ldr x26, [x20, #0x0]\n"
        "ldr x25, [x20, #0x8]\n"
        "ldr x24, [x20, #0x10]\n"
        "ldr x23, [x20, #0x18]\n"
        "cbnz x28, 116f\n"
        "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
        "add x26, x26, x20, LSL #1\n"
        "add x25, x25, x20, LSL #1\n"
        "add x24, x24, x20, LSL #1\n"
        "add x23, x23, x20, LSL #1\n"
        "b 116f\n"
        "115:"  // Height 4: setup direct input
        "mov x26, %x[input_ptr]\n"
        "add x25, x26, x21, LSL #1\n"
        "add x24, x25, x21, LSL #1\n"
        "add x23, x24, x21, LSL #1\n"
        "116:"  // Height 4: input setup done
        "cmp x27, #0x8\n"
        "blt 119f\n"
        "ldr q0, [x26, #0x0]\n"
        "ldr q1, [x25, #0x0]\n"
        "cmp x27, #0x10\n"
        "ldr q2, [x24, #0x0]\n"
        "ldr q3, [x23, #0x0]\n"
        "ldr q6, [x10, #0x0]\n"
        "ldr q7, [x10, #0x10]\n"
        "ldr q8, [x10, #0x20]\n"
        "ldr q9, [x10, #0x30]\n"
        "ldr q10, [x10, #0x40]\n"
        "ldr q11, [x10, #0x50]\n"
        "ldr q12, [x10, #0x60]\n"
        "ldr q13, [x10, #0x70]\n"
        "ldr q14, [x10, #0x80]\n"
        "ldr q15, [x10, #0x90]\n"
        "ldr q16, [x10, #0xa0]\n"
        "ldr q17, [x10, #0xb0]\n"
        "ldr q18, [x10, #0xc0]\n"
        "ldr q19, [x10, #0xd0]\n"
        "blt 118f\n"
        "117:"  // Height 4: Multiply loop: Main loop head
        "fmla v20.8h, v6.8h, v0.h[0]\n"
        "fmla v22.8h, v6.8h, v1.h[0]\n"
        "sub x27, x27, #0x8\n"
        "add x26, x26, #0x10\n"
        "fmla v24.8h, v6.8h, v2.h[0]\n"
        "fmla v26.8h, v6.8h, v3.h[0]\n"
        "ldr q6, [x10, #0xe0]\n"
        "add x25, x25, #0x10\n"
        "fmla v21.8h, v7.8h, v0.h[0]\n"
        "fmla v23.8h, v7.8h, v1.h[0]\n"
        "add x24, x24, #0x10\n"
        "add x23, x23, #0x10\n"
        "fmla v25.8h, v7.8h, v2.h[0]\n"
        "fmla v27.8h, v7.8h, v3.h[0]\n"
        "ldr q7, [x10, #0xf0]\n"
        "cmp x27, #0x10\n"
        "fmla v20.8h, v8.8h, v0.h[1]\n"
        "fmla v22.8h, v8.8h, v1.h[1]\n"
        "add x10, x10, #0x100\n"
        "prfm pldl1keep, [x26, #0x80]\n"
        "fmla v24.8h, v8.8h, v2.h[1]\n"
        "fmla v26.8h, v8.8h, v3.h[1]\n"
        "ldr q8, [x10, #0x20]\n"
        "prfm pldl1keep, [x25, #0x80]\n"
        "fmla v21.8h, v9.8h, v0.h[1]\n"
        "fmla v23.8h, v9.8h, v1.h[1]\n"
        "prfm pldl1keep, [x24, #0x80]\n"
        "prfm pldl1keep, [x23, #0x80]\n"
        "fmla v25.8h, v9.8h, v2.h[1]\n"
        "fmla v27.8h, v9.8h, v3.h[1]\n"
        "ldr q9, [x10, #0x30]\n"
        "fmla v20.8h, v10.8h, v0.h[2]\n"
        "fmla v22.8h, v10.8h, v1.h[2]\n"
        "fmla v24.8h, v10.8h, v2.h[2]\n"
        "fmla v26.8h, v10.8h, v3.h[2]\n"
        "ldr q10, [x10, #0x40]\n"
        "fmla v21.8h, v11.8h, v0.h[2]\n"
        "fmla v23.8h, v11.8h, v1.h[2]\n"
        "fmla v25.8h, v11.8h, v2.h[2]\n"
        "fmla v27.8h, v11.8h, v3.h[2]\n"
        "ldr q11, [x10, #0x50]\n"
        "fmla v20.8h, v12.8h, v0.h[3]\n"
        "fmla v22.8h, v12.8h, v1.h[3]\n"
        "fmla v24.8h, v12.8h, v2.h[3]\n"
        "fmla v26.8h, v12.8h, v3.h[3]\n"
        "ldr q12, [x10, #0x60]\n"
        "fmla v21.8h, v13.8h, v0.h[3]\n"
        "fmla v23.8h, v13.8h, v1.h[3]\n"
        "fmla v25.8h, v13.8h, v2.h[3]\n"
        "fmla v27.8h, v13.8h, v3.h[3]\n"
        "ldr q13, [x10, #0x70]\n"
        "fmla v20.8h, v14.8h, v0.h[4]\n"
        "fmla v22.8h, v14.8h, v1.h[4]\n"
        "fmla v24.8h, v14.8h, v2.h[4]\n"
        "fmla v26.8h, v14.8h, v3.h[4]\n"
        "ldr q14, [x10, #0x80]\n"
        "fmla v21.8h, v15.8h, v0.h[4]\n"
        "fmla v23.8h, v15.8h, v1.h[4]\n"
        "fmla v25.8h, v15.8h, v2.h[4]\n"
        "fmla v27.8h, v15.8h, v3.h[4]\n"
        "ldr q15, [x10, #0x90]\n"
        "fmla v20.8h, v16.8h, v0.h[5]\n"
        "fmla v22.8h, v16.8h, v1.h[5]\n"
        "fmla v24.8h, v16.8h, v2.h[5]\n"
        "fmla v26.8h, v16.8h, v3.h[5]\n"
        "ldr q16, [x10, #0xa0]\n"
        "fmla v21.8h, v17.8h, v0.h[5]\n"
        "fmla v23.8h, v17.8h, v1.h[5]\n"
        "fmla v25.8h, v17.8h, v2.h[5]\n"
        "fmla v27.8h, v17.8h, v3.h[5]\n"
        "ldr q17, [x10, #0xb0]\n"
        "fmla v20.8h, v18.8h, v0.h[6]\n"
        "fmla v22.8h, v18.8h, v1.h[6]\n"
        "fmla v24.8h, v18.8h, v2.h[6]\n"
        "fmla v26.8h, v18.8h, v3.h[6]\n"
        "ldr q18, [x10, #0xc0]\n"
        "fmla v21.8h, v19.8h, v0.h[6]\n"
        "fmla v23.8h, v19.8h, v1.h[6]\n"
        "fmla v25.8h, v19.8h, v2.h[6]\n"
        "fmla v27.8h, v19.8h, v3.h[6]\n"
        "ldr q19, [x10, #0xd0]\n"
        "fmla v20.8h, v6.8h, v0.h[7]\n"
        "fmla v22.8h, v6.8h, v1.h[7]\n"
        "fmla v24.8h, v6.8h, v2.h[7]\n"
        "fmla v26.8h, v6.8h, v3.h[7]\n"
        "ldr q6, [x10, #0x0]\n"
        "fmla v21.8h, v7.8h, v0.h[7]\n"
        "ldr q0, [x26, #0x0]\n"
        "fmla v23.8h, v7.8h, v1.h[7]\n"
        "ldr q1, [x25, #0x0]\n"
        "fmla v25.8h, v7.8h, v2.h[7]\n"
        "ldr q2, [x24, #0x0]\n"
        "fmla v27.8h, v7.8h, v3.h[7]\n"
        "ldr q3, [x23, #0x0]\n"
        "ldr q7, [x10, #0x10]\n"
        "bge 117b\n"
        "118:"  // Height 4: Multiply loop: Single iteration only
        "fmla v20.8h, v6.8h, v0.h[0]\n"
        "fmla v22.8h, v6.8h, v1.h[0]\n"
        "add x26, x26, #0x10\n"
        "add x25, x25, #0x10\n"
        "fmla v24.8h, v6.8h, v2.h[0]\n"
        "fmla v26.8h, v6.8h, v3.h[0]\n"
        "ldr q6, [x10, #0xe0]\n"
        "add x24, x24, #0x10\n"
        "fmla v21.8h, v7.8h, v0.h[0]\n"
        "fmla v23.8h, v7.8h, v1.h[0]\n"
        "add x23, x23, #0x10\n"
        "sub x27, x27, #0x8\n"
        "fmla v25.8h, v7.8h, v2.h[0]\n"
        "fmla v27.8h, v7.8h, v3.h[0]\n"
        "ldr q7, [x10, #0xf0]\n"
        "prfm pldl1keep, [x26, #0x80]\n"
        "fmla v20.8h, v8.8h, v0.h[1]\n"
        "fmla v22.8h, v8.8h, v1.h[1]\n"
        "prfm pldl1keep, [x25, #0x80]\n"
        "prfm pldl1keep, [x24, #0x80]\n"
        "fmla v24.8h, v8.8h, v2.h[1]\n"
        "fmla v26.8h, v8.8h, v3.h[1]\n"
        "prfm pldl1keep, [x23, #0x80]\n"
        "add x10, x10, #0x100\n"
        "fmla v21.8h, v9.8h, v0.h[1]\n"
        "fmla v23.8h, v9.8h, v1.h[1]\n"
        "fmla v25.8h, v9.8h, v2.h[1]\n"
        "fmla v27.8h, v9.8h, v3.h[1]\n"
        "fmla v20.8h, v10.8h, v0.h[2]\n"
        "fmla v22.8h, v10.8h, v1.h[2]\n"
        "fmla v24.8h, v10.8h, v2.h[2]\n"
        "fmla v26.8h, v10.8h, v3.h[2]\n"
        "fmla v21.8h, v11.8h, v0.h[2]\n"
        "fmla v23.8h, v11.8h, v1.h[2]\n"
        "fmla v25.8h, v11.8h, v2.h[2]\n"
        "fmla v27.8h, v11.8h, v3.h[2]\n"
        "fmla v20.8h, v12.8h, v0.h[3]\n"
        "fmla v22.8h, v12.8h, v1.h[3]\n"
        "fmla v24.8h, v12.8h, v2.h[3]\n"
        "fmla v26.8h, v12.8h, v3.h[3]\n"
        "fmla v21.8h, v13.8h, v0.h[3]\n"
        "fmla v23.8h, v13.8h, v1.h[3]\n"
        "fmla v25.8h, v13.8h, v2.h[3]\n"
        "fmla v27.8h, v13.8h, v3.h[3]\n"
        "fmla v20.8h, v14.8h, v0.h[4]\n"
        "fmla v22.8h, v14.8h, v1.h[4]\n"
        "fmla v24.8h, v14.8h, v2.h[4]\n"
        "fmla v26.8h, v14.8h, v3.h[4]\n"
        "fmla v21.8h, v15.8h, v0.h[4]\n"
        "fmla v23.8h, v15.8h, v1.h[4]\n"
        "fmla v25.8h, v15.8h, v2.h[4]\n"
        "fmla v27.8h, v15.8h, v3.h[4]\n"
        "fmla v20.8h, v16.8h, v0.h[5]\n"
        "fmla v22.8h, v16.8h, v1.h[5]\n"
        "fmla v24.8h, v16.8h, v2.h[5]\n"
        "fmla v26.8h, v16.8h, v3.h[5]\n"
        "fmla v21.8h, v17.8h, v0.h[5]\n"
        "fmla v23.8h, v17.8h, v1.h[5]\n"
        "fmla v25.8h, v17.8h, v2.h[5]\n"
        "fmla v27.8h, v17.8h, v3.h[5]\n"
        "fmla v20.8h, v18.8h, v0.h[6]\n"
        "fmla v22.8h, v18.8h, v1.h[6]\n"
        "fmla v24.8h, v18.8h, v2.h[6]\n"
        "fmla v26.8h, v18.8h, v3.h[6]\n"
        "fmla v21.8h, v19.8h, v0.h[6]\n"
        "fmla v23.8h, v19.8h, v1.h[6]\n"
        "fmla v25.8h, v19.8h, v2.h[6]\n"
        "fmla v27.8h, v19.8h, v3.h[6]\n"
        "fmla v20.8h, v6.8h, v0.h[7]\n"
        "fmla v22.8h, v6.8h, v1.h[7]\n"
        "fmla v24.8h, v6.8h, v2.h[7]\n"
        "fmla v26.8h, v6.8h, v3.h[7]\n"
        "fmla v21.8h, v7.8h, v0.h[7]\n"
        "fmla v23.8h, v7.8h, v1.h[7]\n"
        "fmla v25.8h, v7.8h, v2.h[7]\n"
        "fmla v27.8h, v7.8h, v3.h[7]\n"
        "119:"  // Height 4: Multiply loop: Main loop skip
        "cbz x27, 121f\n"
        "120:"  // Height 4: Multiply loop: Odd block loop
        "ldr h0, [x26], #0x2\n"
        "ldr h1, [x25], #0x2\n"
        "sub x27, x27, #0x1\n"
        "ldr h2, [x24], #0x2\n"
        "ldr h3, [x23], #0x2\n"
        "ldr q8, [x10, #0x0]\n"
        "ldr q9, [x10, #0x10]\n"
        "add x10, x10, #0x20\n"
        "fmla v20.8h, v8.8h, v0.h[0]\n"
        "fmla v22.8h, v8.8h, v1.h[0]\n"
        "fmla v24.8h, v8.8h, v2.h[0]\n"
        "fmla v26.8h, v8.8h, v3.h[0]\n"
        "fmla v21.8h, v9.8h, v0.h[0]\n"
        "fmla v23.8h, v9.8h, v1.h[0]\n"
        "fmla v25.8h, v9.8h, v2.h[0]\n"
        "fmla v27.8h, v9.8h, v3.h[0]\n"
        "cbnz x27, 120b\n"
        "121:"  // Height 4: Multiply loop: No odd multiplies
        "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
        "add x28, x28, #0x1\n"
        "cmp x28, x20\n"
        "bne 114b\n"
        "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
        "prfm pstl1keep, [x9, #0x0]\n"
        "add x26, x9, x20, LSL #1\n"
        "prfm pstl1keep, [x26, #0x0]\n"
        "add x25, x26, x20, LSL #1\n"
        "prfm pstl1keep, [x25, #0x0]\n"
        "add x24, x25, x20, LSL #1\n"
        "prfm pstl1keep, [x24, #0x0]\n"
        "tbz %x[flags], #1, 122f\n"
        "add x21, %x[args_ptr], %[offset_max]\n"
        "add x20, %x[args_ptr], %[offset_min]\n"
        "ld1r { v17.8h }, [x21]\n"
        "ld1r { v16.8h }, [x20]\n"
        "fmin v20.8h, v20.8h, v17.8h\n"
        "fmin v21.8h, v21.8h, v17.8h\n"
        "fmin v22.8h, v22.8h, v17.8h\n"
        "fmin v23.8h, v23.8h, v17.8h\n"
        "fmin v24.8h, v24.8h, v17.8h\n"
        "fmin v25.8h, v25.8h, v17.8h\n"
        "fmin v26.8h, v26.8h, v17.8h\n"
        "fmin v27.8h, v27.8h, v17.8h\n"
        "fmax v20.8h, v20.8h, v16.8h\n"
        "fmax v21.8h, v21.8h, v16.8h\n"
        "fmax v22.8h, v22.8h, v16.8h\n"
        "fmax v23.8h, v23.8h, v16.8h\n"
        "fmax v24.8h, v24.8h, v16.8h\n"
        "fmax v25.8h, v25.8h, v16.8h\n"
        "fmax v26.8h, v26.8h, v16.8h\n"
        "fmax v27.8h, v27.8h, v16.8h\n"
        "122:"  // Height 4: No activation
        "cmp x11, #0x10\n"
        "bge 131f\n"
        "tbz x11, #3, 126f\n"
        "st1 { v20.8h }, [x9], #0x10\n"
        "st1 { v22.8h }, [x26], #0x10\n"
        "st1 { v24.8h }, [x25], #0x10\n"
        "st1 { v26.8h }, [x24], #0x10\n"
        "tbz x11, #2, 124f\n"
        "str d21, [x9], #0x8\n"
        "str d23, [x26], #0x8\n"
        "str d25, [x25], #0x8\n"
        "str d27, [x24], #0x8\n"
        "tbz x11, #1, 123f\n"
        "st1 { v21.s }[2], [x9], #0x4\n"
        "st1 { v23.s }[2], [x26], #0x4\n"
        "st1 { v25.s }[2], [x25], #0x4\n"
        "st1 { v27.s }[2], [x24], #0x4\n"
        "tbz x11, #0, 130f\n"
        "st1 { v21.h }[6], [x9]\n"
        "st1 { v23.h }[6], [x26]\n"
        "st1 { v25.h }[6], [x25]\n"
        "st1 { v27.h }[6], [x24]\n"
        "b 130f\n"
        "123:"  // Height 4: Partial direct writeback: partial_1_12
        "tbz x11, #0, 130f\n"
        "st1 { v21.h }[4], [x9]\n"
        "st1 { v23.h }[4], [x26]\n"
        "st1 { v25.h }[4], [x25]\n"
        "st1 { v27.h }[4], [x24]\n"
        "b 130f\n"
        "124:"  // Height 4: Partial direct writeback: partial_2_8
        "tbz x11, #1, 125f\n"
        "str s21, [x9], #0x4\n"
        "str s23, [x26], #0x4\n"
        "str s25, [x25], #0x4\n"
        "str s27, [x24], #0x4\n"
        "tbz x11, #0, 130f\n"
        "st1 { v21.h }[2], [x9]\n"
        "st1 { v23.h }[2], [x26]\n"
        "st1 { v25.h }[2], [x25]\n"
        "st1 { v27.h }[2], [x24]\n"
        "b 130f\n"
        "125:"  // Height 4: Partial direct writeback: partial_1_8
        "tbz x11, #0, 130f\n"
        "str h21, [x9, #0x0]\n"
        "str h23, [x26, #0x0]\n"
        "str h25, [x25, #0x0]\n"
        "str h27, [x24, #0x0]\n"
        "b 130f\n"
        "126:"  // Height 4: Partial direct writeback: partial_4_0
        "tbz x11, #2, 128f\n"
        "str d20, [x9], #0x8\n"
        "str d22, [x26], #0x8\n"
        "str d24, [x25], #0x8\n"
        "str d26, [x24], #0x8\n"
        "tbz x11, #1, 127f\n"
        "st1 { v20.s }[2], [x9], #0x4\n"
        "st1 { v22.s }[2], [x26], #0x4\n"
        "st1 { v24.s }[2], [x25], #0x4\n"
        "st1 { v26.s }[2], [x24], #0x4\n"
        "tbz x11, #0, 130f\n"
        "st1 { v20.h }[6], [x9]\n"
        "st1 { v22.h }[6], [x26]\n"
        "st1 { v24.h }[6], [x25]\n"
        "st1 { v26.h }[6], [x24]\n"
        "b 130f\n"
        "127:"  // Height 4: Partial direct writeback: partial_1_4
        "tbz x11, #0, 130f\n"
        "st1 { v20.h }[4], [x9]\n"
        "st1 { v22.h }[4], [x26]\n"
        "st1 { v24.h }[4], [x25]\n"
        "st1 { v26.h }[4], [x24]\n"
        "b 130f\n"
        "128:"  // Height 4: Partial direct writeback: partial_2_0
        "tbz x11, #1, 129f\n"
        "str s20, [x9], #0x4\n"
        "str s22, [x26], #0x4\n"
        "str s24, [x25], #0x4\n"
        "str s26, [x24], #0x4\n"
        "tbz x11, #0, 130f\n"
        "st1 { v20.h }[2], [x9]\n"
        "st1 { v22.h }[2], [x26]\n"
        "st1 { v24.h }[2], [x25]\n"
        "st1 { v26.h }[2], [x24]\n"
        "b 130f\n"
        "129:"  // Height 4: Partial direct writeback: partial_1_0
        "str h20, [x9, #0x0]\n"
        "str h22, [x26, #0x0]\n"
        "str h24, [x25, #0x0]\n"
        "str h26, [x24, #0x0]\n"
        "130:"  // Height 4: Partial direct writeback: Done
        "b 132f\n"
        "131:"  // Height 4: Full writeback
        "str q20, [x9, #0x0]\n"
        "str q21, [x9, #0x10]\n"
        "add x9, x9, #0x20\n"
        "str q22, [x26, #0x0]\n"
        "str q23, [x26, #0x10]\n"
        "str q24, [x25, #0x0]\n"
        "str q25, [x25, #0x10]\n"
        "str q26, [x24, #0x0]\n"
        "str q27, [x24, #0x10]\n"
        "132:"  // Height 4: Writeback done
        "subs x11, x11, #0x10\n"
        "bgt 101b\n"
        "b 200f\n"
        "133:"  // Height 5
        "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
        "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
        "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
        "134:"  // Height 5: Column loop
        "cbz x10, 135f\n"
        "ldr q20, [x10, #0x0]\n"
        "ldr q21, [x10, #0x10]\n"
        "add x10, x10, #0x20\n"
        "mov v22.16b, v20.16b\n"
        "mov v23.16b, v21.16b\n"
        "mov v24.16b, v20.16b\n"
        "mov v25.16b, v21.16b\n"
        "mov v26.16b, v20.16b\n"
        "mov v27.16b, v21.16b\n"
        "mov v28.16b, v20.16b\n"
        "mov v29.16b, v21.16b\n"
        "b 146f\n"
        "135:"  // Height 5: no bias
        "tbz %x[flags], #0, 145f\n"
        "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
        "cmp x11, #0x10\n"
        "add x26, x9, x20, LSL #1\n"
        "add x25, x26, x20, LSL #1\n"
        "add x24, x25, x20, LSL #1\n"
        "add x23, x24, x20, LSL #1\n"
        "bge 144f\n"
        "tbz x11, #3, 139f\n"
        "ld1 { v20.8h }, [x9], #0x10\n"
        "ld1 { v22.8h }, [x26], #0x10\n"
        "ld1 { v24.8h }, [x25], #0x10\n"
        "ld1 { v26.8h }, [x24], #0x10\n"
        "ld1 { v28.8h }, [x23], #0x10\n"
        "tbz x11, #2, 137f\n"
        "ldr d21, [x9], #0x8\n"
        "ldr d23, [x26], #0x8\n"
        "ldr d25, [x25], #0x8\n"
        "ldr d27, [x24], #0x8\n"
        "ldr d29, [x23], #0x8\n"
        "tbz x11, #1, 136f\n"
        "ld1 { v21.s }[2], [x9], #0x4\n"
        "ld1 { v23.s }[2], [x26], #0x4\n"
        "mov x20, #0x1c\n"
        "ld1 { v25.s }[2], [x25], #0x4\n"
        "ld1 { v27.s }[2], [x24], #0x4\n"
        "ld1 { v29.s }[2], [x23], #0x4\n"
        "tbz x11, #0, 143f\n"
        "ld1 { v21.h }[6], [x9]\n"
        "ld1 { v23.h }[6], [x26]\n"
        "ld1 { v25.h }[6], [x25]\n"
        "ld1 { v27.h }[6], [x24]\n"
        "ld1 { v29.h }[6], [x23]\n"
        "b 143f\n"
        "136:"  // Height 5: Partial accumulate: partial_1_12
        "mov x20, #0x18\n"
        "tbz x11, #0, 143f\n"
        "ld1 { v21.h }[4], [x9]\n"
        "ld1 { v23.h }[4], [x26]\n"
        "ld1 { v25.h }[4], [x25]\n"
        "ld1 { v27.h }[4], [x24]\n"
        "ld1 { v29.h }[4], [x23]\n"
        "b 143f\n"
        "137:"  // Height 5: Partial accumulate: partial_2_8
        "tbz x11, #1, 138f\n"
        "ldr s21, [x9], #0x4\n"
        "ldr s23, [x26], #0x4\n"
        "mov x20, #0x14\n"
        "ldr s25, [x25], #0x4\n"
        "ldr s27, [x24], #0x4\n"
        "ldr s29, [x23], #0x4\n"
        "tbz x11, #0, 143f\n"
        "ld1 { v21.h }[2], [x9]\n"
        "ld1 { v23.h }[2], [x26]\n"
        "ld1 { v25.h }[2], [x25]\n"
        "ld1 { v27.h }[2], [x24]\n"
        "ld1 { v29.h }[2], [x23]\n"
        "b 143f\n"
        "138:"  // Height 5: Partial accumulate: partial_1_8
        "mov x20, #0x10\n"
        "tbz x11, #0, 143f\n"
        "ldr h21, [x9, #0x0]\n"
        "ldr h23, [x26, #0x0]\n"
        "ldr h25, [x25, #0x0]\n"
        "ldr h27, [x24, #0x0]\n"
        "ldr h29, [x23, #0x0]\n"
        "b 143f\n"
        "139:"  // Height 5: Partial accumulate: partial_4_0
        "tbz x11, #2, 141f\n"
        "ldr d20, [x9], #0x8\n"
        "ldr d22, [x26], #0x8\n"
        "ldr d24, [x25], #0x8\n"
        "ldr d26, [x24], #0x8\n"
        "ldr d28, [x23], #0x8\n"
        "tbz x11, #1, 140f\n"
        "ld1 { v20.s }[2], [x9], #0x4\n"
        "ld1 { v22.s }[2], [x26], #0x4\n"
        "mov x20, #0xc\n"
        "ld1 { v24.s }[2], [x25], #0x4\n"
        "ld1 { v26.s }[2], [x24], #0x4\n"
        "ld1 { v28.s }[2], [x23], #0x4\n"
        "tbz x11, #0, 143f\n"
        "ld1 { v20.h }[6], [x9]\n"
        "ld1 { v22.h }[6], [x26]\n"
        "ld1 { v24.h }[6], [x25]\n"
        "ld1 { v26.h }[6], [x24]\n"
        "ld1 { v28.h }[6], [x23]\n"
        "b 143f\n"
        "140:"  // Height 5: Partial accumulate: partial_1_4
        "mov x20, #0x8\n"
        "tbz x11, #0, 143f\n"
        "ld1 { v20.h }[4], [x9]\n"
        "ld1 { v22.h }[4], [x26]\n"
        "ld1 { v24.h }[4], [x25]\n"
        "ld1 { v26.h }[4], [x24]\n"
        "ld1 { v28.h }[4], [x23]\n"
        "b 143f\n"
        "141:"  // Height 5: Partial accumulate: partial_2_0
        "tbz x11, #1, 142f\n"
        "ldr s20, [x9], #0x4\n"
        "ldr s22, [x26], #0x4\n"
        "mov x20, #0x4\n"
        "ldr s24, [x25], #0x4\n"
        "ldr s26, [x24], #0x4\n"
        "ldr s28, [x23], #0x4\n"
        "tbz x11, #0, 143f\n"
        "ld1 { v20.h }[2], [x9]\n"
        "ld1 { v22.h }[2], [x26]\n"
        "ld1 { v24.h }[2], [x25]\n"
        "ld1 { v26.h }[2], [x24]\n"
        "ld1 { v28.h }[2], [x23]\n"
        "b 143f\n"
        "142:"  // Height 5: Partial accumulate: partial_1_0
        "ldr h20, [x9, #0x0]\n"
        "ldr h22, [x26, #0x0]\n"
        "mov x20, #0x0\n"
        "ldr h24, [x25, #0x0]\n"
        "ldr h26, [x24, #0x0]\n"
        "ldr h28, [x23, #0x0]\n"
        "143:"  // Height 5: Partial accumulate: Done
        "sub x9, x9, x20\n"
        "b 146f\n"
        "144:"  // Height 5: full accumulate
        "ldr q20, [x9, #0x0]\n"
        "ldr q21, [x9, #0x10]\n"
        "ldr q22, [x26, #0x0]\n"
        "ldr q23, [x26, #0x10]\n"
        "ldr q24, [x25, #0x0]\n"
        "ldr q25, [x25, #0x10]\n"
        "ldr q26, [x24, #0x0]\n"
        "ldr q27, [x24, #0x10]\n"
        "ldr q28, [x23, #0x0]\n"
        "ldr q29, [x23, #0x10]\n"
        "b 146f\n"
        "145:"  // Height 5: no accumulate
        "movi v20.16b, #0x0\n"
        "movi v21.16b, #0x0\n"
        "movi v22.16b, #0x0\n"
        "movi v23.16b, #0x0\n"
        "movi v24.16b, #0x0\n"
        "movi v25.16b, #0x0\n"
        "movi v26.16b, #0x0\n"
        "movi v27.16b, #0x0\n"
        "movi v28.16b, #0x0\n"
        "movi v29.16b, #0x0\n"
        "146:"  // Height 5: setup done
        "mov x28, #0x0\n"
        "147:"  // Height 5: String loop
        "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
        "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
        "ldr w27, [x20, x28, LSL #0x2]\n"
        "tbz %x[flags], #3, 148f\n"
        "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
        "add x20, x20, x21, LSL #3\n"
        "ldr x26, [x20, #0x0]\n"
        "ldr x25, [x20, #0x8]\n"
        "ldr x24, [x20, #0x10]\n"
        "ldr x23, [x20, #0x18]\n"
        "ldr x22, [x20, #0x20]\n"
        "cbnz x28, 149f\n"
        "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
        "add x26, x26, x20, LSL #1\n"
        "add x25, x25, x20, LSL #1\n"
        "add x24, x24, x20, LSL #1\n"
        "add x23, x23, x20, LSL #1\n"
        "add x22, x22, x20, LSL #1\n"
        "b 149f\n"
        "148:"  // Height 5: setup direct input
        "mov x26, %x[input_ptr]\n"
        "add x25, x26, x21, LSL #1\n"
        "add x24, x25, x21, LSL #1\n"
        "add x23, x24, x21, LSL #1\n"
        "add x22, x23, x21, LSL #1\n"
        "149:"  // Height 5: input setup done
        "cmp x27, #0x8\n"
        "blt 152f\n"
        "ldr q0, [x26, #0x0]\n"
        "ldr q1, [x25, #0x0]\n"
        "cmp x27, #0x10\n"
        "ldr q2, [x24, #0x0]\n"
        "ldr q3, [x23, #0x0]\n"
        "ldr q4, [x22, #0x0]\n"
        "ldr q6, [x10, #0x0]\n"
        "ldr q7, [x10, #0x10]\n"
        "ldr q8, [x10, #0x20]\n"
        "ldr q9, [x10, #0x30]\n"
        "ldr q10, [x10, #0x40]\n"
        "ldr q11, [x10, #0x50]\n"
        "ldr q12, [x10, #0x60]\n"
        "ldr q13, [x10, #0x70]\n"
        "ldr q14, [x10, #0x80]\n"
        "ldr q15, [x10, #0x90]\n"
        "ldr q16, [x10, #0xa0]\n"
        "ldr q17, [x10, #0xb0]\n"
        "ldr q18, [x10, #0xc0]\n"
        "ldr q19, [x10, #0xd0]\n"
        "blt 151f\n"
        "150:"  // Height 5: Multiply loop: Main loop head
        "fmla v20.8h, v6.8h, v0.h[0]\n"
        "fmla v22.8h, v6.8h, v1.h[0]\n"
        "sub x27, x27, #0x8\n"
        "add x26, x26, #0x10\n"
        "fmla v24.8h, v6.8h, v2.h[0]\n"
        "fmla v26.8h, v6.8h, v3.h[0]\n"
        "add x25, x25, #0x10\n"
        "add x24, x24, #0x10\n"
        "fmla v28.8h, v6.8h, v4.h[0]\n"
        "ldr q6, [x10, #0xe0]\n"
        "fmla v21.8h, v7.8h, v0.h[0]\n"
        "add x23, x23, #0x10\n"
        "fmla v23.8h, v7.8h, v1.h[0]\n"
        "fmla v25.8h, v7.8h, v2.h[0]\n"
        "add x22, x22, #0x10\n"
        "cmp x27, #0x10\n"
        "fmla v27.8h, v7.8h, v3.h[0]\n"
        "fmla v29.8h, v7.8h, v4.h[0]\n"
        "ldr q7, [x10, #0xf0]\n"
        "add x10, x10, #0x100\n"
        "fmla v20.8h, v8.8h, v0.h[1]\n"
        "fmla v22.8h, v8.8h, v1.h[1]\n"
        "prfm pldl1keep, [x26, #0x80]\n"
        "prfm pldl1keep, [x25, #0x80]\n"
        "fmla v24.8h, v8.8h, v2.h[1]\n"
        "fmla v26.8h, v8.8h, v3.h[1]\n"
        "prfm pldl1keep, [x24, #0x80]\n"
        "prfm pldl1keep, [x23, #0x80]\n"
        "fmla v28.8h, v8.8h, v4.h[1]\n"
        "ldr q8, [x10, #0x20]\n"
        "fmla v21.8h, v9.8h, v0.h[1]\n"
        "prfm pldl1keep, [x22, #0x80]\n"
        "fmla v23.8h, v9.8h, v1.h[1]\n"
        "fmla v25.8h, v9.8h, v2.h[1]\n"
        "fmla v27.8h, v9.8h, v3.h[1]\n"
        "fmla v29.8h, v9.8h, v4.h[1]\n"
        "ldr q9, [x10, #0x30]\n"
        "fmla v20.8h, v10.8h, v0.h[2]\n"
        "fmla v22.8h, v10.8h, v1.h[2]\n"
        "fmla v24.8h, v10.8h, v2.h[2]\n"
        "fmla v26.8h, v10.8h, v3.h[2]\n"
        "fmla v28.8h, v10.8h, v4.h[2]\n"
        "ldr q10, [x10, #0x40]\n"
        "fmla v21.8h, v11.8h, v0.h[2]\n"
        "fmla v23.8h, v11.8h, v1.h[2]\n"
        "fmla v25.8h, v11.8h, v2.h[2]\n"
        "fmla v27.8h, v11.8h, v3.h[2]\n"
        "fmla v29.8h, v11.8h, v4.h[2]\n"
        "ldr q11, [x10, #0x50]\n"
        "fmla v20.8h, v12.8h, v0.h[3]\n"
        "fmla v22.8h, v12.8h, v1.h[3]\n"
        "fmla v24.8h, v12.8h, v2.h[3]\n"
        "fmla v26.8h, v12.8h, v3.h[3]\n"
        "fmla v28.8h, v12.8h, v4.h[3]\n"
        "ldr q12, [x10, #0x60]\n"
        "fmla v21.8h, v13.8h, v0.h[3]\n"
        "fmla v23.8h, v13.8h, v1.h[3]\n"
        "fmla v25.8h, v13.8h, v2.h[3]\n"
        "fmla v27.8h, v13.8h, v3.h[3]\n"
        "fmla v29.8h, v13.8h, v4.h[3]\n"
        "ldr q13, [x10, #0x70]\n"
        "fmla v20.8h, v14.8h, v0.h[4]\n"
        "fmla v22.8h, v14.8h, v1.h[4]\n"
        "fmla v24.8h, v14.8h, v2.h[4]\n"
        "fmla v26.8h, v14.8h, v3.h[4]\n"
        "fmla v28.8h, v14.8h, v4.h[4]\n"
        "ldr q14, [x10, #0x80]\n"
        "fmla v21.8h, v15.8h, v0.h[4]\n"
        "fmla v23.8h, v15.8h, v1.h[4]\n"
        "fmla v25.8h, v15.8h, v2.h[4]\n"
        "fmla v27.8h, v15.8h, v3.h[4]\n"
        "fmla v29.8h, v15.8h, v4.h[4]\n"
        "ldr q15, [x10, #0x90]\n"
        "fmla v20.8h, v16.8h, v0.h[5]\n"
        "fmla v22.8h, v16.8h, v1.h[5]\n"
        "fmla v24.8h, v16.8h, v2.h[5]\n"
        "fmla v26.8h, v16.8h, v3.h[5]\n"
        "fmla v28.8h, v16.8h, v4.h[5]\n"
        "ldr q16, [x10, #0xa0]\n"
        "fmla v21.8h, v17.8h, v0.h[5]\n"
        "fmla v23.8h, v17.8h, v1.h[5]\n"
        "fmla v25.8h, v17.8h, v2.h[5]\n"
        "fmla v27.8h, v17.8h, v3.h[5]\n"
        "fmla v29.8h, v17.8h, v4.h[5]\n"
        "ldr q17, [x10, #0xb0]\n"
        "fmla v20.8h, v18.8h, v0.h[6]\n"
        "fmla v22.8h, v18.8h, v1.h[6]\n"
        "fmla v24.8h, v18.8h, v2.h[6]\n"
        "fmla v26.8h, v18.8h, v3.h[6]\n"
        "fmla v28.8h, v18.8h, v4.h[6]\n"
        "ldr q18, [x10, #0xc0]\n"
        "fmla v21.8h, v19.8h, v0.h[6]\n"
        "fmla v23.8h, v19.8h, v1.h[6]\n"
        "fmla v25.8h, v19.8h, v2.h[6]\n"
        "fmla v27.8h, v19.8h, v3.h[6]\n"
        "fmla v29.8h, v19.8h, v4.h[6]\n"
        "ldr q19, [x10, #0xd0]\n"
        "fmla v20.8h, v6.8h, v0.h[7]\n"
        "fmla v22.8h, v6.8h, v1.h[7]\n"
        "fmla v24.8h, v6.8h, v2.h[7]\n"
        "fmla v26.8h, v6.8h, v3.h[7]\n"
        "fmla v28.8h, v6.8h, v4.h[7]\n"
        "ldr q6, [x10, #0x0]\n"
        "fmla v21.8h, v7.8h, v0.h[7]\n"
        "ldr q0, [x26, #0x0]\n"
        "fmla v23.8h, v7.8h, v1.h[7]\n"
        "ldr q1, [x25, #0x0]\n"
        "fmla v25.8h, v7.8h, v2.h[7]\n"
        "ldr q2, [x24, #0x0]\n"
        "fmla v27.8h, v7.8h, v3.h[7]\n"
        "ldr q3, [x23, #0x0]\n"
        "fmla v29.8h, v7.8h, v4.h[7]\n"
        "ldr q4, [x22, #0x0]\n"
        "ldr q7, [x10, #0x10]\n"
        "bge 150b\n"
        "151:"  // Height 5: Multiply loop: Single iteration only
        "fmla v20.8h, v6.8h, v0.h[0]\n"
        "fmla v22.8h, v6.8h, v1.h[0]\n"
        "add x26, x26, #0x10\n"
        "add x25, x25, #0x10\n"
        "fmla v24.8h, v6.8h, v2.h[0]\n"
        "fmla v26.8h, v6.8h, v3.h[0]\n"
        "add x24, x24, #0x10\n"
        "add x23, x23, #0x10\n"
        "fmla v28.8h, v6.8h, v4.h[0]\n"
        "ldr q6, [x10, #0xe0]\n"
        "fmla v21.8h, v7.8h, v0.h[0]\n"
        "add x22, x22, #0x10\n"
        "fmla v23.8h, v7.8h, v1.h[0]\n"
        "fmla v25.8h, v7.8h, v2.h[0]\n"
        "sub x27, x27, #0x8\n"
        "prfm pldl1keep, [x26, #0x80]\n"
        "fmla v27.8h, v7.8h, v3.h[0]\n"
        "fmla v29.8h, v7.8h, v4.h[0]\n"
        "ldr q7, [x10, #0xf0]\n"
        "prfm pldl1keep, [x25, #0x80]\n"
        "fmla v20.8h, v8.8h, v0.h[1]\n"
        "fmla v22.8h, v8.8h, v1.h[1]\n"
        "prfm pldl1keep, [x24, #0x80]\n"
        "prfm pldl1keep, [x23, #0x80]\n"
        "fmla v24.8h, v8.8h, v2.h[1]\n"
        "fmla v26.8h, v8.8h, v3.h[1]\n"
        "prfm pldl1keep, [x22, #0x80]\n"
        "add x10, x10, #0x100\n"
        "fmla v28.8h, v8.8h, v4.h[1]\n"
        "fmla v21.8h, v9.8h, v0.h[1]\n"
        "fmla v23.8h, v9.8h, v1.h[1]\n"
        "fmla v25.8h, v9.8h, v2.h[1]\n"
        "fmla v27.8h, v9.8h, v3.h[1]\n"
        "fmla v29.8h, v9.8h, v4.h[1]\n"
        "fmla v20.8h, v10.8h, v0.h[2]\n"
        "fmla v22.8h, v10.8h, v1.h[2]\n"
        "fmla v24.8h, v10.8h, v2.h[2]\n"
        "fmla v26.8h, v10.8h, v3.h[2]\n"
        "fmla v28.8h, v10.8h, v4.h[2]\n"
        "fmla v21.8h, v11.8h, v0.h[2]\n"
        "fmla v23.8h, v11.8h, v1.h[2]\n"
        "fmla v25.8h, v11.8h, v2.h[2]\n"
        "fmla v27.8h, v11.8h, v3.h[2]\n"
        "fmla v29.8h, v11.8h, v4.h[2]\n"
        "fmla v20.8h, v12.8h, v0.h[3]\n"
        "fmla v22.8h, v12.8h, v1.h[3]\n"
        "fmla v24.8h, v12.8h, v2.h[3]\n"
        "fmla v26.8h, v12.8h, v3.h[3]\n"
        "fmla v28.8h, v12.8h, v4.h[3]\n"
        "fmla v21.8h, v13.8h, v0.h[3]\n"
        "fmla v23.8h, v13.8h, v1.h[3]\n"
        "fmla v25.8h, v13.8h, v2.h[3]\n"
        "fmla v27.8h, v13.8h, v3.h[3]\n"
        "fmla v29.8h, v13.8h, v4.h[3]\n"
        "fmla v20.8h, v14.8h, v0.h[4]\n"
        "fmla v22.8h, v14.8h, v1.h[4]\n"
        "fmla v24.8h, v14.8h, v2.h[4]\n"
        "fmla v26.8h, v14.8h, v3.h[4]\n"
        "fmla v28.8h, v14.8h, v4.h[4]\n"
        "fmla v21.8h, v15.8h, v0.h[4]\n"
        "fmla v23.8h, v15.8h, v1.h[4]\n"
        "fmla v25.8h, v15.8h, v2.h[4]\n"
        "fmla v27.8h, v15.8h, v3.h[4]\n"
        "fmla v29.8h, v15.8h, v4.h[4]\n"
        "fmla v20.8h, v16.8h, v0.h[5]\n"
        "fmla v22.8h, v16.8h, v1.h[5]\n"
        "fmla v24.8h, v16.8h, v2.h[5]\n"
        "fmla v26.8h, v16.8h, v3.h[5]\n"
        "fmla v28.8h, v16.8h, v4.h[5]\n"
        "fmla v21.8h, v17.8h, v0.h[5]\n"
        "fmla v23.8h, v17.8h, v1.h[5]\n"
        "fmla v25.8h, v17.8h, v2.h[5]\n"
        "fmla v27.8h, v17.8h, v3.h[5]\n"
        "fmla v29.8h, v17.8h, v4.h[5]\n"
        "fmla v20.8h, v18.8h, v0.h[6]\n"
        "fmla v22.8h, v18.8h, v1.h[6]\n"
        "fmla v24.8h, v18.8h, v2.h[6]\n"
        "fmla v26.8h, v18.8h, v3.h[6]\n"
        "fmla v28.8h, v18.8h, v4.h[6]\n"
        "fmla v21.8h, v19.8h, v0.h[6]\n"
        "fmla v23.8h, v19.8h, v1.h[6]\n"
        "fmla v25.8h, v19.8h, v2.h[6]\n"
        "fmla v27.8h, v19.8h, v3.h[6]\n"
        "fmla v29.8h, v19.8h, v4.h[6]\n"
        "fmla v20.8h, v6.8h, v0.h[7]\n"
        "fmla v22.8h, v6.8h, v1.h[7]\n"
        "fmla v24.8h, v6.8h, v2.h[7]\n"
        "fmla v26.8h, v6.8h, v3.h[7]\n"
        "fmla v28.8h, v6.8h, v4.h[7]\n"
        "fmla v21.8h, v7.8h, v0.h[7]\n"
        "fmla v23.8h, v7.8h, v1.h[7]\n"
        "fmla v25.8h, v7.8h, v2.h[7]\n"
        "fmla v27.8h, v7.8h, v3.h[7]\n"
        "fmla v29.8h, v7.8h, v4.h[7]\n"
        "152:"  // Height 5: Multiply loop: Main loop skip
        "cbz x27, 154f\n"
        "153:"  // Height 5: Multiply loop: Odd block loop
        "ldr h0, [x26], #0x2\n"
        "ldr h1, [x25], #0x2\n"
        "sub x27, x27, #0x1\n"
        "ldr h2, [x24], #0x2\n"
        "ldr h3, [x23], #0x2\n"
        "ldr h4, [x22], #0x2\n"
        "ldr q8, [x10, #0x0]\n"
        "ldr q9, [x10, #0x10]\n"
        "add x10, x10, #0x20\n"
        "fmla v20.8h, v8.8h, v0.h[0]\n"
        "fmla v22.8h, v8.8h, v1.h[0]\n"
        "fmla v24.8h, v8.8h, v2.h[0]\n"
        "fmla v26.8h, v8.8h, v3.h[0]\n"
        "fmla v28.8h, v8.8h, v4.h[0]\n"
        "fmla v21.8h, v9.8h, v0.h[0]\n"
        "fmla v23.8h, v9.8h, v1.h[0]\n"
        "fmla v25.8h, v9.8h, v2.h[0]\n"
        "fmla v27.8h, v9.8h, v3.h[0]\n"
        "fmla v29.8h, v9.8h, v4.h[0]\n"
        "cbnz x27, 153b\n"
        "154:"  // Height 5: Multiply loop: No odd multiplies
        "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
        "add x28, x28, #0x1\n"
        "cmp x28, x20\n"
        "bne 147b\n"
        "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
        "prfm pstl1keep, [x9, #0x0]\n"
        "add x26, x9, x20, LSL #1\n"
        "prfm pstl1keep, [x26, #0x0]\n"
        "add x25, x26, x20, LSL #1\n"
        "prfm pstl1keep, [x25, #0x0]\n"
        "add x24, x25, x20, LSL #1\n"
        "prfm pstl1keep, [x24, #0x0]\n"
        "add x23, x24, x20, LSL #1\n"
        "prfm pstl1keep, [x23, #0x0]\n"
        "tbz %x[flags], #1, 155f\n"
        "add x21, %x[args_ptr], %[offset_max]\n"
        "add x20, %x[args_ptr], %[offset_min]\n"
        "ld1r { v17.8h }, [x21]\n"
        "ld1r { v16.8h }, [x20]\n"
        "fmin v20.8h, v20.8h, v17.8h\n"
        "fmin v21.8h, v21.8h, v17.8h\n"
        "fmin v22.8h, v22.8h, v17.8h\n"
        "fmin v23.8h, v23.8h, v17.8h\n"
        "fmin v24.8h, v24.8h, v17.8h\n"
        "fmin v25.8h, v25.8h, v17.8h\n"
        "fmin v26.8h, v26.8h, v17.8h\n"
        "fmin v27.8h, v27.8h, v17.8h\n"
        "fmin v28.8h, v28.8h, v17.8h\n"
        "fmin v29.8h, v29.8h, v17.8h\n"
        "fmax v20.8h, v20.8h, v16.8h\n"
        "fmax v21.8h, v21.8h, v16.8h\n"
        "fmax v22.8h, v22.8h, v16.8h\n"
        "fmax v23.8h, v23.8h, v16.8h\n"
        "fmax v24.8h, v24.8h, v16.8h\n"
        "fmax v25.8h, v25.8h, v16.8h\n"
        "fmax v26.8h, v26.8h, v16.8h\n"
        "fmax v27.8h, v27.8h, v16.8h\n"
        "fmax v28.8h, v28.8h, v16.8h\n"
        "fmax v29.8h, v29.8h, v16.8h\n"
        "155:"  // Height 5: No activation
        "cmp x11, #0x10\n"
        "bge 164f\n"
        "tbz x11, #3, 159f\n"
        "st1 { v20.8h }, [x9], #0x10\n"
        "st1 { v22.8h }, [x26], #0x10\n"
        "st1 { v24.8h }, [x25], #0x10\n"
        "st1 { v26.8h }, [x24], #0x10\n"
        "st1 { v28.8h }, [x23], #0x10\n"
        "tbz x11, #2, 157f\n"
        "str d21, [x9], #0x8\n"
        "str d23, [x26], #0x8\n"
        "str d25, [x25], #0x8\n"
        "str d27, [x24], #0x8\n"
        "str d29, [x23], #0x8\n"
        "tbz x11, #1, 156f\n"
        "st1 { v21.s }[2], [x9], #0x4\n"
        "st1 { v23.s }[2], [x26], #0x4\n"
        "st1 { v25.s }[2], [x25], #0x4\n"
        "st1 { v27.s }[2], [x24], #0x4\n"
        "st1 { v29.s }[2], [x23], #0x4\n"
        "tbz x11, #0, 163f\n"
        "st1 { v21.h }[6], [x9]\n"
        "st1 { v23.h }[6], [x26]\n"
        "st1 { v25.h }[6], [x25]\n"
        "st1 { v27.h }[6], [x24]\n"
        "st1 { v29.h }[6], [x23]\n"
        "b 163f\n"
        "156:"  // Height 5: Partial direct writeback: partial_1_12
        "tbz x11, #0, 163f\n"
        "st1 { v21.h }[4], [x9]\n"
        "st1 { v23.h }[4], [x26]\n"
        "st1 { v25.h }[4], [x25]\n"
        "st1 { v27.h }[4], [x24]\n"
        "st1 { v29.h }[4], [x23]\n"
        "b 163f\n"
        "157:"  // Height 5: Partial direct writeback: partial_2_8
        "tbz x11, #1, 158f\n"
        "str s21, [x9], #0x4\n"
        "str s23, [x26], #0x4\n"
        "str s25, [x25], #0x4\n"
        "str s27, [x24], #0x4\n"
        "str s29, [x23], #0x4\n"
        "tbz x11, #0, 163f\n"
        "st1 { v21.h }[2], [x9]\n"
        "st1 { v23.h }[2], [x26]\n"
        "st1 { v25.h }[2], [x25]\n"
        "st1 { v27.h }[2], [x24]\n"
        "st1 { v29.h }[2], [x23]\n"
        "b 163f\n"
        "158:"  // Height 5: Partial direct writeback: partial_1_8
        "tbz x11, #0, 163f\n"
        "str h21, [x9, #0x0]\n"
        "str h23, [x26, #0x0]\n"
        "str h25, [x25, #0x0]\n"
        "str h27, [x24, #0x0]\n"
        "str h29, [x23, #0x0]\n"
        "b 163f\n"
        "159:"  // Height 5: Partial direct writeback: partial_4_0
        "tbz x11, #2, 161f\n"
        "str d20, [x9], #0x8\n"
        "str d22, [x26], #0x8\n"
        "str d24, [x25], #0x8\n"
        "str d26, [x24], #0x8\n"
        "str d28, [x23], #0x8\n"
        "tbz x11, #1, 160f\n"
        "st1 { v20.s }[2], [x9], #0x4\n"
        "st1 { v22.s }[2], [x26], #0x4\n"
        "st1 { v24.s }[2], [x25], #0x4\n"
        "st1 { v26.s }[2], [x24], #0x4\n"
        "st1 { v28.s }[2], [x23], #0x4\n"
        "tbz x11, #0, 163f\n"
        "st1 { v20.h }[6], [x9]\n"
        "st1 { v22.h }[6], [x26]\n"
        "st1 { v24.h }[6], [x25]\n"
        "st1 { v26.h }[6], [x24]\n"
        "st1 { v28.h }[6], [x23]\n"
        "b 163f\n"
        "160:"  // Height 5: Partial direct writeback: partial_1_4
        "tbz x11, #0, 163f\n"
        "st1 { v20.h }[4], [x9]\n"
        "st1 { v22.h }[4], [x26]\n"
        "st1 { v24.h }[4], [x25]\n"
        "st1 { v26.h }[4], [x24]\n"
        "st1 { v28.h }[4], [x23]\n"
        "b 163f\n"
        "161:"  // Height 5: Partial direct writeback: partial_2_0
        "tbz x11, #1, 162f\n"
        "str s20, [x9], #0x4\n"
        "str s22, [x26], #0x4\n"
        "str s24, [x25], #0x4\n"
        "str s26, [x24], #0x4\n"
        "str s28, [x23], #0x4\n"
        "tbz x11, #0, 163f\n"
        "st1 { v20.h }[2], [x9]\n"
        "st1 { v22.h }[2], [x26]\n"
        "st1 { v24.h }[2], [x25]\n"
        "st1 { v26.h }[2], [x24]\n"
        "st1 { v28.h }[2], [x23]\n"
        "b 163f\n"
        "162:"  // Height 5: Partial direct writeback: partial_1_0
        "str h20, [x9, #0x0]\n"
        "str h22, [x26, #0x0]\n"
        "str h24, [x25, #0x0]\n"
        "str h26, [x24, #0x0]\n"
        "str h28, [x23, #0x0]\n"
        "163:"  // Height 5: Partial direct writeback: Done
        "b 165f\n"
        "164:"  // Height 5: Full writeback
        "str q20, [x9, #0x0]\n"
        "str q21, [x9, #0x10]\n"
        "add x9, x9, #0x20\n"
        "str q22, [x26, #0x0]\n"
        "str q23, [x26, #0x10]\n"
        "str q24, [x25, #0x0]\n"
        "str q25, [x25, #0x10]\n"
        "str q26, [x24, #0x0]\n"
        "str q27, [x24, #0x10]\n"
        "str q28, [x23, #0x0]\n"
        "str q29, [x23, #0x10]\n"
        "165:"  // Height 5: Writeback done
        "subs x11, x11, #0x10\n"
        "bgt 134b\n"
        "b 200f\n"
        "166:"  // Height 6
        "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
        "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
        "mov x20, #0xc\n"
        "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
        "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
        "madd x20, x21, x20, x9\n"
        "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
        "167:"  // Height 6: Column loop
        "cbz x10, 168f\n"
        "ldr q20, [x10, #0x0]\n"
        "ldr q21, [x10, #0x10]\n"
        "add x10, x10, #0x20\n"
        "mov v22.16b, v20.16b\n"
        "mov v23.16b, v21.16b\n"
        "mov v24.16b, v20.16b\n"
        "mov v25.16b, v21.16b\n"
        "mov v26.16b, v20.16b\n"
        "mov v27.16b, v21.16b\n"
        "mov v28.16b, v20.16b\n"
        "mov v29.16b, v21.16b\n"
        "mov v30.16b, v20.16b\n"
        "mov v31.16b, v21.16b\n"
        "b 179f\n"
        "168:"  // Height 6: no bias
        "tbz %x[flags], #0, 178f\n"
        "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
        "cmp x11, #0x10\n"
        "add x26, x9, x20, LSL #1\n"
        "add x25, x26, x20, LSL #1\n"
        "add x24, x25, x20, LSL #1\n"
        "add x23, x24, x20, LSL #1\n"
        "add x22, x23, x20, LSL #1\n"
        "bge 177f\n"
        "tbz x11, #3, 172f\n"
        "ld1 { v20.8h }, [x9], #0x10\n"
        "ld1 { v22.8h }, [x26], #0x10\n"
        "ld1 { v24.8h }, [x25], #0x10\n"
        "ld1 { v26.8h }, [x24], #0x10\n"
        "ld1 { v28.8h }, [x23], #0x10\n"
        "ld1 { v30.8h }, [x22], #0x10\n"
        "tbz x11, #2, 170f\n"
        "ldr d21, [x9], #0x8\n"
        "ldr d23, [x26], #0x8\n"
        "ldr d25, [x25], #0x8\n"
        "ldr d27, [x24], #0x8\n"
        "ldr d29, [x23], #0x8\n"
        "ldr d31, [x22], #0x8\n"
        "tbz x11, #1, 169f\n"
        "ld1 { v21.s }[2], [x9], #0x4\n"
        "ld1 { v23.s }[2], [x26], #0x4\n"
        "mov x20, #0x1c\n"
        "ld1 { v25.s }[2], [x25], #0x4\n"
        "ld1 { v27.s }[2], [x24], #0x4\n"
        "ld1 { v29.s }[2], [x23], #0x4\n"
        "ld1 { v31.s }[2], [x22], #0x4\n"
        "tbz x11, #0, 176f\n"
        "ld1 { v21.h }[6], [x9]\n"
        "ld1 { v23.h }[6], [x26]\n"
        "ld1 { v25.h }[6], [x25]\n"
        "ld1 { v27.h }[6], [x24]\n"
        "ld1 { v29.h }[6], [x23]\n"
        "ld1 { v31.h }[6], [x22]\n"
        "b 176f\n"
        "169:"  // Height 6: Partial accumulate: partial_1_12
        "mov x20, #0x18\n"
        "tbz x11, #0, 176f\n"
        "ld1 { v21.h }[4], [x9]\n"
        "ld1 { v23.h }[4], [x26]\n"
        "ld1 { v25.h }[4], [x25]\n"
        "ld1 { v27.h }[4], [x24]\n"
        "ld1 { v29.h }[4], [x23]\n"
        "ld1 { v31.h }[4], [x22]\n"
        "b 176f\n"
        "170:"  // Height 6: Partial accumulate: partial_2_8
        "tbz x11, #1, 171f\n"
        "ldr s21, [x9], #0x4\n"
        "ldr s23, [x26], #0x4\n"
        "mov x20, #0x14\n"
        "ldr s25, [x25], #0x4\n"
        "ldr s27, [x24], #0x4\n"
        "ldr s29, [x23], #0x4\n"
        "ldr s31, [x22], #0x4\n"
        "tbz x11, #0, 176f\n"
        "ld1 { v21.h }[2], [x9]\n"
        "ld1 { v23.h }[2], [x26]\n"
        "ld1 { v25.h }[2], [x25]\n"
        "ld1 { v27.h }[2], [x24]\n"
        "ld1 { v29.h }[2], [x23]\n"
        "ld1 { v31.h }[2], [x22]\n"
        "b 176f\n"
        "171:"  // Height 6: Partial accumulate: partial_1_8
        "mov x20, #0x10\n"
        "tbz x11, #0, 176f\n"
        "ldr h21, [x9, #0x0]\n"
        "ldr h23, [x26, #0x0]\n"
        "ldr h25, [x25, #0x0]\n"
        "ldr h27, [x24, #0x0]\n"
        "ldr h29, [x23, #0x0]\n"
        "ldr h31, [x22, #0x0]\n"
        "b 176f\n"
        "172:"  // Height 6: Partial accumulate: partial_4_0
        "tbz x11, #2, 174f\n"
        "ldr d20, [x9], #0x8\n"
        "ldr d22, [x26], #0x8\n"
        "ldr d24, [x25], #0x8\n"
        "ldr d26, [x24], #0x8\n"
        "ldr d28, [x23], #0x8\n"
        "ldr d30, [x22], #0x8\n"
        "tbz x11, #1, 173f\n"
        "ld1 { v20.s }[2], [x9], #0x4\n"
        "ld1 { v22.s }[2], [x26], #0x4\n"
        "mov x20, #0xc\n"
        "ld1 { v24.s }[2], [x25], #0x4\n"
        "ld1 { v26.s }[2], [x24], #0x4\n"
        "ld1 { v28.s }[2], [x23], #0x4\n"
        "ld1 { v30.s }[2], [x22], #0x4\n"
        "tbz x11, #0, 176f\n"
        "ld1 { v20.h }[6], [x9]\n"
        "ld1 { v22.h }[6], [x26]\n"
        "ld1 { v24.h }[6], [x25]\n"
        "ld1 { v26.h }[6], [x24]\n"
        "ld1 { v28.h }[6], [x23]\n"
        "ld1 { v30.h }[6], [x22]\n"
        "b 176f\n"
        "173:"  // Height 6: Partial accumulate: partial_1_4
        "mov x20, #0x8\n"
        "tbz x11, #0, 176f\n"
        "ld1 { v20.h }[4], [x9]\n"
        "ld1 { v22.h }[4], [x26]\n"
        "ld1 { v24.h }[4], [x25]\n"
        "ld1 { v26.h }[4], [x24]\n"
        "ld1 { v28.h }[4], [x23]\n"
        "ld1 { v30.h }[4], [x22]\n"
        "b 176f\n"
        "174:"  // Height 6: Partial accumulate: partial_2_0
        "tbz x11, #1, 175f\n"
        "ldr s20, [x9], #0x4\n"
        "ldr s22, [x26], #0x4\n"
        "mov x20, #0x4\n"
        "ldr s24, [x25], #0x4\n"
        "ldr s26, [x24], #0x4\n"
        "ldr s28, [x23], #0x4\n"
        "ldr s30, [x22], #0x4\n"
        "tbz x11, #0, 176f\n"
        "ld1 { v20.h }[2], [x9]\n"
        "ld1 { v22.h }[2], [x26]\n"
        "ld1 { v24.h }[2], [x25]\n"
        "ld1 { v26.h }[2], [x24]\n"
        "ld1 { v28.h }[2], [x23]\n"
        "ld1 { v30.h }[2], [x22]\n"
        "b 176f\n"
        "175:"  // Height 6: Partial accumulate: partial_1_0
        "ldr h20, [x9, #0x0]\n"
        "ldr h22, [x26, #0x0]\n"
        "mov x20, #0x0\n"
        "ldr h24, [x25, #0x0]\n"
        "ldr h26, [x24, #0x0]\n"
        "ldr h28, [x23, #0x0]\n"
        "ldr h30, [x22, #0x0]\n"
        "176:"  // Height 6: Partial accumulate: Done
        "sub x9, x9, x20\n"
        "b 179f\n"
        "177:"  // Height 6: full accumulate
        "ldr q20, [x9, #0x0]\n"
        "ldr q21, [x9, #0x10]\n"
        "ldr q22, [x26, #0x0]\n"
        "ldr q23, [x26, #0x10]\n"
        "ldr q24, [x25, #0x0]\n"
        "ldr q25, [x25, #0x10]\n"
        "ldr q26, [x24, #0x0]\n"
        "ldr q27, [x24, #0x10]\n"
        "ldr q28, [x23, #0x0]\n"
        "ldr q29, [x23, #0x10]\n"
        "ldr q30, [x22, #0x0]\n"
        "ldr q31, [x22, #0x10]\n"
        "b 179f\n"
        "178:"  // Height 6: no accumulate
        "movi v20.16b, #0x0\n"
        "movi v21.16b, #0x0\n"
        "movi v22.16b, #0x0\n"
        "movi v23.16b, #0x0\n"
        "movi v24.16b, #0x0\n"
        "movi v25.16b, #0x0\n"
        "movi v26.16b, #0x0\n"
        "movi v27.16b, #0x0\n"
        "movi v28.16b, #0x0\n"
        "movi v29.16b, #0x0\n"
        "movi v30.16b, #0x0\n"
        "movi v31.16b, #0x0\n"
        "179:"  // Height 6: setup done
        "mov x28, #0x0\n"
        "180:"  // Height 6: String loop
        "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
        "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
        "ldr w27, [x20, x28, LSL #0x2]\n"
        "tbz %x[flags], #3, 181f\n"
        "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
        "add x20, x20, x21, LSL #3\n"
        "ldr x26, [x20, #0x0]\n"
        "ldr x25, [x20, #0x8]\n"
        "ldr x24, [x20, #0x10]\n"
        "ldr x23, [x20, #0x18]\n"
        "ldr x22, [x20, #0x20]\n"
        "ldr x21, [x20, #0x28]\n"
        "cbnz x28, 182f\n"
        "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
        "add x26, x26, x20, LSL #1\n"
        "add x25, x25, x20, LSL #1\n"
        "add x24, x24, x20, LSL #1\n"
        "add x23, x23, x20, LSL #1\n"
        "add x22, x22, x20, LSL #1\n"
        "add x21, x21, x20, LSL #1\n"
        "b 182f\n"
        "181:"  // Height 6: setup direct input
        "mov x26, %x[input_ptr]\n"
        "add x25, x26, x21, LSL #1\n"
        "add x24, x25, x21, LSL #1\n"
        "add x23, x24, x21, LSL #1\n"
        "add x22, x23, x21, LSL #1\n"
        "add x21, x22, x21, LSL #1\n"
        "182:"  // Height 6: input setup done
        "cmp x27, #0x8\n"
        "blt 185f\n"
        "ldr q0, [x26, #0x0]\n"
        "ldr q1, [x25, #0x0]\n"
        "cmp x27, #0x10\n"
        "ldr q2, [x24, #0x0]\n"
        "ldr q3, [x23, #0x0]\n"
        "ldr q4, [x22, #0x0]\n"
        "ldr q5, [x21, #0x0]\n"
        "ldr q6, [x10, #0x0]\n"
        "ldr q7, [x10, #0x10]\n"
        "ldr q8, [x10, #0x20]\n"
        "ldr q9, [x10, #0x30]\n"
        "ldr q10, [x10, #0x40]\n"
        "ldr q11, [x10, #0x50]\n"
        "ldr q12, [x10, #0x60]\n"
        "ldr q13, [x10, #0x70]\n"
        "ldr q14, [x10, #0x80]\n"
        "ldr q15, [x10, #0x90]\n"
        "ldr q16, [x10, #0xa0]\n"
        "ldr q17, [x10, #0xb0]\n"
        "ldr q18, [x10, #0xc0]\n"
        "ldr q19, [x10, #0xd0]\n"
        "blt 184f\n"
        "183:"  // Height 6: Multiply loop: Main loop head
        "fmla v20.8h, v6.8h, v0.h[0]\n"
        "fmla v22.8h, v6.8h, v1.h[0]\n"
        "sub x27, x27, #0x8\n"
        "add x26, x26, #0x10\n"
        "fmla v24.8h, v6.8h, v2.h[0]\n"
        "fmla v26.8h, v6.8h, v3.h[0]\n"
        "add x25, x25, #0x10\n"
        "add x24, x24, #0x10\n"
        "fmla v28.8h, v6.8h, v4.h[0]\n"
        "fmla v30.8h, v6.8h, v5.h[0]\n"
        "ldr q6, [x10, #0xe0]\n"
        "add x23, x23, #0x10\n"
        "fmla v21.8h, v7.8h, v0.h[0]\n"
        "fmla v23.8h, v7.8h, v1.h[0]\n"
        "add x22, x22, #0x10\n"
        "add x21, x21, #0x10\n"
        "fmla v25.8h, v7.8h, v2.h[0]\n"
        "fmla v27.8h, v7.8h, v3.h[0]\n"
        "cmp x27, #0x10\n"
        "prfm pldl1keep, [x26, #0x80]\n"
        "fmla v29.8h, v7.8h, v4.h[0]\n"
        "fmla v31.8h, v7.8h, v5.h[0]\n"
        "ldr q7, [x10, #0xf0]\n"
        "add x10, x10, #0x100\n"
        "fmla v20.8h, v8.8h, v0.h[1]\n"
        "fmla v22.8h, v8.8h, v1.h[1]\n"
        "prfm pldl1keep, [x25, #0x80]\n"
        "prfm pldl1keep, [x24, #0x80]\n"
        "fmla v24.8h, v8.8h, v2.h[1]\n"
        "fmla v26.8h, v8.8h, v3.h[1]\n"
        "prfm pldl1keep, [x23, #0x80]\n"
        "prfm pldl1keep, [x22, #0x80]\n"
        "fmla v28.8h, v8.8h, v4.h[1]\n"
        "fmla v30.8h, v8.8h, v5.h[1]\n"
        "ldr q8, [x10, #0x20]\n"
        "prfm pldl1keep, [x21, #0x80]\n"
        "fmla v21.8h, v9.8h, v0.h[1]\n"
        "fmla v23.8h, v9.8h, v1.h[1]\n"
        "fmla v25.8h, v9.8h, v2.h[1]\n"
        "fmla v27.8h, v9.8h, v3.h[1]\n"
        "fmla v29.8h, v9.8h, v4.h[1]\n"
        "fmla v31.8h, v9.8h, v5.h[1]\n"
        "ldr q9, [x10, #0x30]\n"
        "fmla v20.8h, v10.8h, v0.h[2]\n"
        "fmla v22.8h, v10.8h, v1.h[2]\n"
        "fmla v24.8h, v10.8h, v2.h[2]\n"
        "fmla v26.8h, v10.8h, v3.h[2]\n"
        "fmla v28.8h, v10.8h, v4.h[2]\n"
        "fmla v30.8h, v10.8h, v5.h[2]\n"
        "ldr q10, [x10, #0x40]\n"
        "fmla v21.8h, v11.8h, v0.h[2]\n"
        "fmla v23.8h, v11.8h, v1.h[2]\n"
        "fmla v25.8h, v11.8h, v2.h[2]\n"
        "fmla v27.8h, v11.8h, v3.h[2]\n"
        "fmla v29.8h, v11.8h, v4.h[2]\n"
        "fmla v31.8h, v11.8h, v5.h[2]\n"
        "ldr q11, [x10, #0x50]\n"
        "fmla v20.8h, v12.8h, v0.h[3]\n"
        "fmla v22.8h, v12.8h, v1.h[3]\n"
        "fmla v24.8h, v12.8h, v2.h[3]\n"
        "fmla v26.8h, v12.8h, v3.h[3]\n"
        "fmla v28.8h, v12.8h, v4.h[3]\n"
        "fmla v30.8h, v12.8h, v5.h[3]\n"
        "ldr q12, [x10, #0x60]\n"
        "fmla v21.8h, v13.8h, v0.h[3]\n"
        "fmla v23.8h, v13.8h, v1.h[3]\n"
        "fmla v25.8h, v13.8h, v2.h[3]\n"
        "fmla v27.8h, v13.8h, v3.h[3]\n"
        "fmla v29.8h, v13.8h, v4.h[3]\n"
        "fmla v31.8h, v13.8h, v5.h[3]\n"
        "ldr q13, [x10, #0x70]\n"
        "fmla v20.8h, v14.8h, v0.h[4]\n"
        "fmla v22.8h, v14.8h, v1.h[4]\n"
        "fmla v24.8h, v14.8h, v2.h[4]\n"
        "fmla v26.8h, v14.8h, v3.h[4]\n"
        "fmla v28.8h, v14.8h, v4.h[4]\n"
        "fmla v30.8h, v14.8h, v5.h[4]\n"
        "ldr q14, [x10, #0x80]\n"
        "fmla v21.8h, v15.8h, v0.h[4]\n"
        "fmla v23.8h, v15.8h, v1.h[4]\n"
        "fmla v25.8h, v15.8h, v2.h[4]\n"
        "fmla v27.8h, v15.8h, v3.h[4]\n"
        "fmla v29.8h, v15.8h, v4.h[4]\n"
        "fmla v31.8h, v15.8h, v5.h[4]\n"
        "ldr q15, [x10, #0x90]\n"
        "fmla v20.8h, v16.8h, v0.h[5]\n"
        "fmla v22.8h, v16.8h, v1.h[5]\n"
        "fmla v24.8h, v16.8h, v2.h[5]\n"
        "fmla v26.8h, v16.8h, v3.h[5]\n"
        "fmla v28.8h, v16.8h, v4.h[5]\n"
        "fmla v30.8h, v16.8h, v5.h[5]\n"
        "ldr q16, [x10, #0xa0]\n"
        "fmla v21.8h, v17.8h, v0.h[5]\n"
        "fmla v23.8h, v17.8h, v1.h[5]\n"
        "fmla v25.8h, v17.8h, v2.h[5]\n"
        "fmla v27.8h, v17.8h, v3.h[5]\n"
        "fmla v29.8h, v17.8h, v4.h[5]\n"
        "fmla v31.8h, v17.8h, v5.h[5]\n"
        "ldr q17, [x10, #0xb0]\n"
        "fmla v20.8h, v18.8h, v0.h[6]\n"
        "fmla v22.8h, v18.8h, v1.h[6]\n"
        "fmla v24.8h, v18.8h, v2.h[6]\n"
        "fmla v26.8h, v18.8h, v3.h[6]\n"
        "fmla v28.8h, v18.8h, v4.h[6]\n"
        "fmla v30.8h, v18.8h, v5.h[6]\n"
        "ldr q18, [x10, #0xc0]\n"
        "fmla v21.8h, v19.8h, v0.h[6]\n"
        "fmla v23.8h, v19.8h, v1.h[6]\n"
        "fmla v25.8h, v19.8h, v2.h[6]\n"
        "fmla v27.8h, v19.8h, v3.h[6]\n"
        "fmla v29.8h, v19.8h, v4.h[6]\n"
        "fmla v31.8h, v19.8h, v5.h[6]\n"
        "ldr q19, [x10, #0xd0]\n"
        "fmla v20.8h, v6.8h, v0.h[7]\n"
        "fmla v22.8h, v6.8h, v1.h[7]\n"
        "fmla v24.8h, v6.8h, v2.h[7]\n"
        "fmla v26.8h, v6.8h, v3.h[7]\n"
        "fmla v28.8h, v6.8h, v4.h[7]\n"
        "fmla v30.8h, v6.8h, v5.h[7]\n"
        "ldr q6, [x10, #0x0]\n"
        "fmla v21.8h, v7.8h, v0.h[7]\n"
        "ldr q0, [x26, #0x0]\n"
        "fmla v23.8h, v7.8h, v1.h[7]\n"
        "ldr q1, [x25, #0x0]\n"
        "fmla v25.8h, v7.8h, v2.h[7]\n"
        "ldr q2, [x24, #0x0]\n"
        "fmla v27.8h, v7.8h, v3.h[7]\n"
        "ldr q3, [x23, #0x0]\n"
        "fmla v29.8h, v7.8h, v4.h[7]\n"
        "ldr q4, [x22, #0x0]\n"
        "fmla v31.8h, v7.8h, v5.h[7]\n"
        "ldr q5, [x21, #0x0]\n"
        "ldr q7, [x10, #0x10]\n"
        "bge 183b\n"
        "184:"  // Height 6: Multiply loop: Single iteration only
        "fmla v20.8h, v6.8h, v0.h[0]\n"
        "fmla v22.8h, v6.8h, v1.h[0]\n"
        "add x26, x26, #0x10\n"
        "add x25, x25, #0x10\n"
        "fmla v24.8h, v6.8h, v2.h[0]\n"
        "fmla v26.8h, v6.8h, v3.h[0]\n"
        "add x24, x24, #0x10\n"
        "add x23, x23, #0x10\n"
        "fmla v28.8h, v6.8h, v4.h[0]\n"
        "fmla v30.8h, v6.8h, v5.h[0]\n"
        "ldr q6, [x10, #0xe0]\n"
        "add x22, x22, #0x10\n"
        "fmla v21.8h, v7.8h, v0.h[0]\n"
        "fmla v23.8h, v7.8h, v1.h[0]\n"
        "add x21, x21, #0x10\n"
        "sub x27, x27, #0x8\n"
        "fmla v25.8h, v7.8h, v2.h[0]\n"
        "fmla v27.8h, v7.8h, v3.h[0]\n"
        "prfm pldl1keep, [x26, #0x80]\n"
        "prfm pldl1keep, [x25, #0x80]\n"
        "fmla v29.8h, v7.8h, v4.h[0]\n"
        "fmla v31.8h, v7.8h, v5.h[0]\n"
        "ldr q7, [x10, #0xf0]\n"
        "prfm pldl1keep, [x24, #0x80]\n"
        "fmla v20.8h, v8.8h, v0.h[1]\n"
        "fmla v22.8h, v8.8h, v1.h[1]\n"
        "prfm pldl1keep, [x23, #0x80]\n"
        "prfm pldl1keep, [x22, #0x80]\n"
        "fmla v24.8h, v8.8h, v2.h[1]\n"
        "fmla v26.8h, v8.8h, v3.h[1]\n"
        "prfm pldl1keep, [x21, #0x80]\n"
        "add x10, x10, #0x100\n"
        "fmla v28.8h, v8.8h, v4.h[1]\n"
        "fmla v30.8h, v8.8h, v5.h[1]\n"
        "fmla v21.8h, v9.8h, v0.h[1]\n"
        "fmla v23.8h, v9.8h, v1.h[1]\n"
        "fmla v25.8h, v9.8h, v2.h[1]\n"
        "fmla v27.8h, v9.8h, v3.h[1]\n"
        "fmla v29.8h, v9.8h, v4.h[1]\n"
        "fmla v31.8h, v9.8h, v5.h[1]\n"
        "fmla v20.8h, v10.8h, v0.h[2]\n"
        "fmla v22.8h, v10.8h, v1.h[2]\n"
        "fmla v24.8h, v10.8h, v2.h[2]\n"
        "fmla v26.8h, v10.8h, v3.h[2]\n"
        "fmla v28.8h, v10.8h, v4.h[2]\n"
        "fmla v30.8h, v10.8h, v5.h[2]\n"
        "fmla v21.8h, v11.8h, v0.h[2]\n"
        "fmla v23.8h, v11.8h, v1.h[2]\n"
        "fmla v25.8h, v11.8h, v2.h[2]\n"
        "fmla v27.8h, v11.8h, v3.h[2]\n"
        "fmla v29.8h, v11.8h, v4.h[2]\n"
        "fmla v31.8h, v11.8h, v5.h[2]\n"
        "fmla v20.8h, v12.8h, v0.h[3]\n"
        "fmla v22.8h, v12.8h, v1.h[3]\n"
        "fmla v24.8h, v12.8h, v2.h[3]\n"
        "fmla v26.8h, v12.8h, v3.h[3]\n"
        "fmla v28.8h, v12.8h, v4.h[3]\n"
        "fmla v30.8h, v12.8h, v5.h[3]\n"
        "fmla v21.8h, v13.8h, v0.h[3]\n"
        "fmla v23.8h, v13.8h, v1.h[3]\n"
        "fmla v25.8h, v13.8h, v2.h[3]\n"
        "fmla v27.8h, v13.8h, v3.h[3]\n"
        "fmla v29.8h, v13.8h, v4.h[3]\n"
        "fmla v31.8h, v13.8h, v5.h[3]\n"
        "fmla v20.8h, v14.8h, v0.h[4]\n"
        "fmla v22.8h, v14.8h, v1.h[4]\n"
        "fmla v24.8h, v14.8h, v2.h[4]\n"
        "fmla v26.8h, v14.8h, v3.h[4]\n"
        "fmla v28.8h, v14.8h, v4.h[4]\n"
        "fmla v30.8h, v14.8h, v5.h[4]\n"
        "fmla v21.8h, v15.8h, v0.h[4]\n"
        "fmla v23.8h, v15.8h, v1.h[4]\n"
        "fmla v25.8h, v15.8h, v2.h[4]\n"
        "fmla v27.8h, v15.8h, v3.h[4]\n"
        "fmla v29.8h, v15.8h, v4.h[4]\n"
        "fmla v31.8h, v15.8h, v5.h[4]\n"
        "fmla v20.8h, v16.8h, v0.h[5]\n"
        "fmla v22.8h, v16.8h, v1.h[5]\n"
        "fmla v24.8h, v16.8h, v2.h[5]\n"
        "fmla v26.8h, v16.8h, v3.h[5]\n"
        "fmla v28.8h, v16.8h, v4.h[5]\n"
        "fmla v30.8h, v16.8h, v5.h[5]\n"
        "fmla v21.8h, v17.8h, v0.h[5]\n"
        "fmla v23.8h, v17.8h, v1.h[5]\n"
        "fmla v25.8h, v17.8h, v2.h[5]\n"
        "fmla v27.8h, v17.8h, v3.h[5]\n"
        "fmla v29.8h, v17.8h, v4.h[5]\n"
        "fmla v31.8h, v17.8h, v5.h[5]\n"
        "fmla v20.8h, v18.8h, v0.h[6]\n"
        "fmla v22.8h, v18.8h, v1.h[6]\n"
        "fmla v24.8h, v18.8h, v2.h[6]\n"
        "fmla v26.8h, v18.8h, v3.h[6]\n"
        "fmla v28.8h, v18.8h, v4.h[6]\n"
        "fmla v30.8h, v18.8h, v5.h[6]\n"
        "fmla v21.8h, v19.8h, v0.h[6]\n"
        "fmla v23.8h, v19.8h, v1.h[6]\n"
        "fmla v25.8h, v19.8h, v2.h[6]\n"
        "fmla v27.8h, v19.8h, v3.h[6]\n"
        "fmla v29.8h, v19.8h, v4.h[6]\n"
        "fmla v31.8h, v19.8h, v5.h[6]\n"
        "fmla v20.8h, v6.8h, v0.h[7]\n"
        "fmla v22.8h, v6.8h, v1.h[7]\n"
        "fmla v24.8h, v6.8h, v2.h[7]\n"
        "fmla v26.8h, v6.8h, v3.h[7]\n"
        "fmla v28.8h, v6.8h, v4.h[7]\n"
        "fmla v30.8h, v6.8h, v5.h[7]\n"
        "fmla v21.8h, v7.8h, v0.h[7]\n"
        "fmla v23.8h, v7.8h, v1.h[7]\n"
        "fmla v25.8h, v7.8h, v2.h[7]\n"
        "fmla v27.8h, v7.8h, v3.h[7]\n"
        "fmla v29.8h, v7.8h, v4.h[7]\n"
        "fmla v31.8h, v7.8h, v5.h[7]\n"
        "185:"  // Height 6: Multiply loop: Main loop skip
        "cbz x27, 187f\n"
        "186:"  // Height 6: Multiply loop: Odd block loop
        "ldr h0, [x26], #0x2\n"
        "ldr h1, [x25], #0x2\n"
        "sub x27, x27, #0x1\n"
        "ldr h2, [x24], #0x2\n"
        "ldr h3, [x23], #0x2\n"
        "ldr h4, [x22], #0x2\n"
        "ldr h5, [x21], #0x2\n"
        "ldr q8, [x10, #0x0]\n"
        "ldr q9, [x10, #0x10]\n"
        "add x10, x10, #0x20\n"
        "fmla v20.8h, v8.8h, v0.h[0]\n"
        "fmla v22.8h, v8.8h, v1.h[0]\n"
        "fmla v24.8h, v8.8h, v2.h[0]\n"
        "fmla v26.8h, v8.8h, v3.h[0]\n"
        "fmla v28.8h, v8.8h, v4.h[0]\n"
        "fmla v30.8h, v8.8h, v5.h[0]\n"
        "fmla v21.8h, v9.8h, v0.h[0]\n"
        "fmla v23.8h, v9.8h, v1.h[0]\n"
        "fmla v25.8h, v9.8h, v2.h[0]\n"
        "fmla v27.8h, v9.8h, v3.h[0]\n"
        "fmla v29.8h, v9.8h, v4.h[0]\n"
        "fmla v31.8h, v9.8h, v5.h[0]\n"
        "cbnz x27, 186b\n"
        "187:"  // Height 6: Multiply loop: No odd multiplies
        "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
        "add x28, x28, #0x1\n"
        "cmp x28, x20\n"
        "bne 180b\n"
        "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
        "prfm pstl1keep, [x9, #0x0]\n"
        "add x26, x9, x20, LSL #1\n"
        "prfm pstl1keep, [x26, #0x0]\n"
        "add x25, x26, x20, LSL #1\n"
        "prfm pstl1keep, [x25, #0x0]\n"
        "add x24, x25, x20, LSL #1\n"
        "prfm pstl1keep, [x24, #0x0]\n"
        "add x23, x24, x20, LSL #1\n"
        "add x22, x23, x20, LSL #1\n"
        "prfm pstl1keep, [x23, #0x0]\n"
        "prfm pstl1keep, [x22, #0x0]\n"
        "tbz %x[flags], #1, 188f\n"
        "add x21, %x[args_ptr], %[offset_max]\n"
        "add x20, %x[args_ptr], %[offset_min]\n"
        "ld1r { v17.8h }, [x21]\n"
        "ld1r { v16.8h }, [x20]\n"
        "fmin v20.8h, v20.8h, v17.8h\n"
        "fmin v21.8h, v21.8h, v17.8h\n"
        "fmin v22.8h, v22.8h, v17.8h\n"
        "fmin v23.8h, v23.8h, v17.8h\n"
        "fmin v24.8h, v24.8h, v17.8h\n"
        "fmin v25.8h, v25.8h, v17.8h\n"
        "fmin v26.8h, v26.8h, v17.8h\n"
        "fmin v27.8h, v27.8h, v17.8h\n"
        "fmin v28.8h, v28.8h, v17.8h\n"
        "fmin v29.8h, v29.8h, v17.8h\n"
        "fmin v30.8h, v30.8h, v17.8h\n"
        "fmin v31.8h, v31.8h, v17.8h\n"
        "fmax v20.8h, v20.8h, v16.8h\n"
        "fmax v21.8h, v21.8h, v16.8h\n"
        "fmax v22.8h, v22.8h, v16.8h\n"
        "fmax v23.8h, v23.8h, v16.8h\n"
        "fmax v24.8h, v24.8h, v16.8h\n"
        "fmax v25.8h, v25.8h, v16.8h\n"
        "fmax v26.8h, v26.8h, v16.8h\n"
        "fmax v27.8h, v27.8h, v16.8h\n"
        "fmax v28.8h, v28.8h, v16.8h\n"
        "fmax v29.8h, v29.8h, v16.8h\n"
        "fmax v30.8h, v30.8h, v16.8h\n"
        "fmax v31.8h, v31.8h, v16.8h\n"
        "188:"  // Height 6: No activation
        "cmp x11, #0x10\n"
        "bge 197f\n"
        "tbz x11, #3, 192f\n"
        "st1 { v20.8h }, [x9], #0x10\n"
        "st1 { v22.8h }, [x26], #0x10\n"
        "st1 { v24.8h }, [x25], #0x10\n"
        "st1 { v26.8h }, [x24], #0x10\n"
        "st1 { v28.8h }, [x23], #0x10\n"
        "st1 { v30.8h }, [x22], #0x10\n"
        "tbz x11, #2, 190f\n"
        "str d21, [x9], #0x8\n"
        "str d23, [x26], #0x8\n"
        "str d25, [x25], #0x8\n"
        "str d27, [x24], #0x8\n"
        "str d29, [x23], #0x8\n"
        "str d31, [x22], #0x8\n"
        "tbz x11, #1, 189f\n"
        "st1 { v21.s }[2], [x9], #0x4\n"
        "st1 { v23.s }[2], [x26], #0x4\n"
        "st1 { v25.s }[2], [x25], #0x4\n"
        "st1 { v27.s }[2], [x24], #0x4\n"
        "st1 { v29.s }[2], [x23], #0x4\n"
        "st1 { v31.s }[2], [x22], #0x4\n"
        "tbz x11, #0, 196f\n"
        "st1 { v21.h }[6], [x9]\n"
        "st1 { v23.h }[6], [x26]\n"
        "st1 { v25.h }[6], [x25]\n"
        "st1 { v27.h }[6], [x24]\n"
        "st1 { v29.h }[6], [x23]\n"
        "st1 { v31.h }[6], [x22]\n"
        "b 196f\n"
        "189:"  // Height 6: Partial direct writeback: partial_1_12
        "tbz x11, #0, 196f\n"
        "st1 { v21.h }[4], [x9]\n"
        "st1 { v23.h }[4], [x26]\n"
        "st1 { v25.h }[4], [x25]\n"
        "st1 { v27.h }[4], [x24]\n"
        "st1 { v29.h }[4], [x23]\n"
        "st1 { v31.h }[4], [x22]\n"
        "b 196f\n"
        "190:"  // Height 6: Partial direct writeback: partial_2_8
        "tbz x11, #1, 191f\n"
        "str s21, [x9], #0x4\n"
        "str s23, [x26], #0x4\n"
        "str s25, [x25], #0x4\n"
        "str s27, [x24], #0x4\n"
        "str s29, [x23], #0x4\n"
        "str s31, [x22], #0x4\n"
        "tbz x11, #0, 196f\n"
        "st1 { v21.h }[2], [x9]\n"
        "st1 { v23.h }[2], [x26]\n"
        "st1 { v25.h }[2], [x25]\n"
        "st1 { v27.h }[2], [x24]\n"
        "st1 { v29.h }[2], [x23]\n"
        "st1 { v31.h }[2], [x22]\n"
        "b 196f\n"
        "191:"  // Height 6: Partial direct writeback: partial_1_8
        "tbz x11, #0, 196f\n"
        "str h21, [x9, #0x0]\n"
        "str h23, [x26, #0x0]\n"
        "str h25, [x25, #0x0]\n"
        "str h27, [x24, #0x0]\n"
        "str h29, [x23, #0x0]\n"
        "str h31, [x22, #0x0]\n"
        "b 196f\n"
        "192:"  // Height 6: Partial direct writeback: partial_4_0
        "tbz x11, #2, 194f\n"
        "str d20, [x9], #0x8\n"
        "str d22, [x26], #0x8\n"
        "str d24, [x25], #0x8\n"
        "str d26, [x24], #0x8\n"
        "str d28, [x23], #0x8\n"
        "str d30, [x22], #0x8\n"
        "tbz x11, #1, 193f\n"
        "st1 { v20.s }[2], [x9], #0x4\n"
        "st1 { v22.s }[2], [x26], #0x4\n"
        "st1 { v24.s }[2], [x25], #0x4\n"
        "st1 { v26.s }[2], [x24], #0x4\n"
        "st1 { v28.s }[2], [x23], #0x4\n"
        "st1 { v30.s }[2], [x22], #0x4\n"
        "tbz x11, #0, 196f\n"
        "st1 { v20.h }[6], [x9]\n"
        "st1 { v22.h }[6], [x26]\n"
        "st1 { v24.h }[6], [x25]\n"
        "st1 { v26.h }[6], [x24]\n"
        "st1 { v28.h }[6], [x23]\n"
        "st1 { v30.h }[6], [x22]\n"
        "b 196f\n"
        "193:"  // Height 6: Partial direct writeback: partial_1_4
        "tbz x11, #0, 196f\n"
        "st1 { v20.h }[4], [x9]\n"
        "st1 { v22.h }[4], [x26]\n"
        "st1 { v24.h }[4], [x25]\n"
        "st1 { v26.h }[4], [x24]\n"
        "st1 { v28.h }[4], [x23]\n"
        "st1 { v30.h }[4], [x22]\n"
        "b 196f\n"
        "194:"  // Height 6: Partial direct writeback: partial_2_0
        "tbz x11, #1, 195f\n"
        "str s20, [x9], #0x4\n"
        "str s22, [x26], #0x4\n"
        "str s24, [x25], #0x4\n"
        "str s26, [x24], #0x4\n"
        "str s28, [x23], #0x4\n"
        "str s30, [x22], #0x4\n"
        "tbz x11, #0, 196f\n"
        "st1 { v20.h }[2], [x9]\n"
        "st1 { v22.h }[2], [x26]\n"
        "st1 { v24.h }[2], [x25]\n"
        "st1 { v26.h }[2], [x24]\n"
        "st1 { v28.h }[2], [x23]\n"
        "st1 { v30.h }[2], [x22]\n"
        "b 196f\n"
        "195:"  // Height 6: Partial direct writeback: partial_1_0
        "str h20, [x9, #0x0]\n"
        "str h22, [x26, #0x0]\n"
        "str h24, [x25, #0x0]\n"
        "str h26, [x24, #0x0]\n"
        "str h28, [x23, #0x0]\n"
        "str h30, [x22, #0x0]\n"
        "196:"  // Height 6: Partial direct writeback: Done
        "b 198f\n"
        "197:"  // Height 6: Full writeback
        "str q20, [x9, #0x0]\n"
        "str q21, [x9, #0x10]\n"
        "add x9, x9, #0x20\n"
        "str q22, [x26, #0x0]\n"
        "str q23, [x26, #0x10]\n"
        "str q24, [x25, #0x0]\n"
        "str q25, [x25, #0x10]\n"
        "str q26, [x24, #0x0]\n"
        "str q27, [x24, #0x10]\n"
        "str q28, [x23, #0x0]\n"
        "str q29, [x23, #0x10]\n"
        "str q30, [x22, #0x0]\n"
        "str q31, [x22, #0x10]\n"
        "198:"  // Height 6: Writeback done
        "subs x11, x11, #0x10\n"
        "bgt 167b\n"
        "subs %x[m], %x[m], #0x6\n"
        "beq 200f\n"
        "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
        "tbz %x[flags], #3, 199f\n"
        "add x21, x21, #0x6\n"
        "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
        "b 1b\n"
        "199:"  // Update direct input
        "mov x20, #0xc\n"
        "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
        "b 1b\n"
        "200:"  // Exit
        : [input_ptr] "+&r"(input_ptr), [m] "+&r"(m)
        : [args_ptr] "r"(&ka), [flags] "r"(flags), [offset_max] "I"(offsetof(KernelArgs, maxval)),
          [offset_min] "I"(offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I"(offsetof(KernelArgs, B_ptr)),
          [offsetof_N] "I"(offsetof(KernelArgs, N)),
          [offsetof_input_initial_col] "I"(offsetof(KernelArgs, input_initial_col)),
          [offsetof_input_offset] "I"(offsetof(KernelArgs, input_offset)),
          [offsetof_num_strings] "I"(offsetof(KernelArgs, num_strings)),
          [offsetof_output_offset] "I"(offsetof(KernelArgs, output_offset)),
          [offsetof_output_ptr] "I"(offsetof(KernelArgs, output_ptr)),
          [offsetof_string_lengths] "I"(offsetof(KernelArgs, string_lengths))
        : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
          "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
          "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28");
}

#endif  // Architectural features check.
